POD fixes
[bioperl-live.git] / Bio / SimpleAlign.pm
blob1f8e3f89e41fc0608832d3c3683745871d43321d
1 # BioPerl module for SimpleAlign
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 # History:
14 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
15 # May 2001 major rewrite - Heikki Lehvaslaiho
17 =head1 NAME
19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
21 =head1 SYNOPSIS
23 # Use Bio::AlignIO to read in the alignment
24 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
25 $aln = $str->next_aln();
27 # Describe
28 print $aln->length;
29 print $aln->num_residues;
30 print $aln->is_flush;
31 print $aln->num_sequences;
32 print $aln->score;
33 print $aln->percentage_identity;
34 print $aln->consensus_string(50);
36 # Find the position in the alignment for a sequence location
37 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
39 # Extract sequences and check values for the alignment column $pos
40 foreach $seq ($aln->each_seq) {
41 $res = $seq->subseq($pos, $pos);
42 $count{$res}++;
44 foreach $res (keys %count) {
45 printf "Res: %s Count: %2d\n", $res, $count{$res};
48 # Manipulate
49 $aln->remove_seq($seq);
50 $mini_aln = $aln->slice(20,30); # get a block of columns
51 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
52 $new_aln = $aln->remove_columns([20,30]); # remove by position
53 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
55 # Analyze
56 $str = $aln->consensus_string($threshold_percent);
57 $str = $aln->match_line();
58 $str = $aln->cigar_line();
59 $id = $aln->percentage_identity;
61 # See the module documentation for details and more methods.
63 =head1 DESCRIPTION
65 SimpleAlign is an object that handles a multiple sequence alignment
66 (MSA). It is very permissive of types (it does not insist on sequences
67 being all same length, for example). Think of it as a set of sequences
68 with a whole series of built-in manipulations and methods for reading and
69 writing alignments.
71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
72 to store its sequences. These are subsequences with a start and end
73 positions in the parent reference sequence. Each sequence in the
74 SimpleAlign object is a Bio::LocatableSeq.
76 SimpleAlign expects the combination of name, start, and end for a
77 given sequence to be unique in the alignment, and this is the key for the
78 internal hashes (name, start, end are abbreviated C<nse> in the code).
79 However, in some cases people do not want the name/start-end to be displayed:
80 either multiple names in an alignment or names specific to the alignment
81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
82 C<displayname>, and generally is what is used to print out the
83 alignment. They default to name/start-end.
85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to one
93 of the Bioperl mailing lists. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Support
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 the bugs and their resolution. Bug reports can be submitted via the
113 web:
115 https://redmine.open-bio.org/projects/bioperl/
117 =head1 AUTHOR
119 Ewan Birney, birney@ebi.ac.uk
121 =head1 CONTRIBUTORS
123 Allen Day, allenday-at-ucla.edu,
124 Richard Adams, Richard.Adams-at-ed.ac.uk,
125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
127 Allen Smith, allens-at-cpan.org,
128 Jason Stajich, jason-at-bioperl.org,
129 Anthony Underwood, aunderwood-at-phls.org.uk,
130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
131 Brian Osborne, bosborne at alum.mit.edu
132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
133 Hongyu Zhang, forward at hongyu.org
134 Jay Hannah, jay at jays.net
135 Alexandr Bezginov, albezg at gmail.com
137 =head1 SEE ALSO
139 L<Bio::LocatableSeq>
141 =head1 APPENDIX
143 The rest of the documentation details each of the object
144 methods. Internal methods are usually preceded with a _
146 =cut
148 # 'Let the code begin...
150 package Bio::SimpleAlign;
151 use vars qw(%CONSERVATION_GROUPS);
152 use strict;
154 use Bio::LocatableSeq; # uses Seq's as list
156 use Bio::Seq;
157 use Bio::SeqFeature::Generic;
159 BEGIN {
160 # This data should probably be in a more centralized module...
161 # it is taken from Clustalw documentation.
162 # These are all the positively scoring groups that occur in the
163 # Gonnet Pam250 matrix. The strong and weak groups are
164 # defined as strong score >0.5 and weak score =<0.5 respectively.
166 %CONSERVATION_GROUPS = (
167 'strong' => [ qw(
169 NEQK
170 NHQK
171 NDEQ
172 QHRK
173 MILV
174 MILF
176 FYW )],
177 'weak' => [ qw(
181 STNK
182 STPA
183 SGND
184 SNDEQK
185 NDEQHK
186 NEQHRK
187 FVLIM
188 HFY )],);
191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
192 Bio::FeatureHolderI);
194 =head2 new
196 Title : new
197 Usage : my $aln = Bio::SimpleAlign->new();
198 Function : Creates a new simple align object
199 Returns : Bio::SimpleAlign
200 Args : -source => string representing the source program
201 where this alignment came from
202 -annotation => Bio::AnnotationCollectionI
203 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
204 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
205 -consensus => consensus string
206 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
208 =cut
211 sub new {
212 my($class,@args) = @_;
214 my $self = $class->SUPER::new(@args);
216 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
217 SOURCE
218 SCORE
220 ACCESSION
221 DESCRIPTION
222 SEQS
223 FEATURES
224 ANNOTATION
225 SEQ_ANNOTATION
226 CONSENSUS
227 CONSENSUS_META
228 )], @args);
229 $src && $self->source($src);
230 defined $score && $self->score($score);
231 # we need to set up internal hashs first!
233 $self->{'_seq'} = {};
234 $self->{'_order'} = {};
235 $self->{'_start_end_lists'} = {};
236 $self->{'_dis_name'} = {};
237 $self->{'_id'} = 'NoName';
238 # maybe we should automatically read in from args. Hmmm...
239 $id && $self->id($id);
240 $acc && $self->accession($acc);
241 $desc && $self->description($desc);
242 $coll && $self->annotation($coll);
243 # sequence annotation is layered into a provided annotation collection (or dies)
244 if ($sa) {
245 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
246 "with a sequence annotation collection")
247 if !$coll;
248 $coll->add_Annotation('seq_annotation', $sa);
250 if ($feats && ref $feats eq 'ARRAY') {
251 for my $feat (@$feats) {
252 $self->add_SeqFeature($feat);
255 $con && $self->consensus($con);
256 $cmeta && $self->consensus_meta($cmeta);
257 # assumes these are in correct alignment order
258 if ($seqs && ref($seqs) eq 'ARRAY') {
259 for my $seq (@$seqs) {
260 $self->add_seq($seq);
264 return $self; # success - we hope!
267 =head1 Modifier methods
269 These methods modify the MSA by adding, removing or shuffling complete
270 sequences.
272 =head2 add_seq
274 Title : add_seq
275 Usage : $myalign->add_seq($newseq);
276 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
277 Function : Adds another sequence to the alignment. *Does not* align
278 it - just adds it to the hashes.
279 If -ORDER is specified, the sequence is inserted at the
280 the position spec'd by -ORDER, and existing sequences
281 are pushed down the storage array.
282 Returns : nothing
283 Args : A Bio::LocatableSeq object
284 Positive integer for the sequence position (optional)
286 See L<Bio::LocatableSeq> for more information
288 =cut
290 sub addSeq {
291 my $self = shift;
292 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
293 $self->add_seq(@_);
296 sub add_seq {
297 my $self = shift;
298 my @args = @_;
299 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
300 my ($name,$id,$start,$end);
302 unless ($seq) {
303 $self->throw("LocatableSeq argument required");
305 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
306 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
308 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
309 $order--; # jay's patch (user-specified order is 1-origin)
311 if ($order < 0) {
312 $self->throw("User-specified value for ORDER must be >= 1");
315 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
317 # build the symbol list for this sequence,
318 # will prune out the gap and missing/match chars
319 # when actually asked for the symbol list in the
320 # symbol_chars
321 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
323 $name = $seq->get_nse;
325 if( $self->{'_seq'}->{$name} ) {
326 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
328 else {
329 $self->debug( "Assigning $name to $order\n");
331 my $ordh = $self->{'_order'};
332 if ($ordh->{$order}) {
333 # make space to insert
334 # $c->() returns (in reverse order) the first subsequence
335 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
336 # (3,2,1), and $c->(2,4,5) returns (2).
337 my $c;
338 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
339 map {
340 $ordh->{$_+1} = $ordh->{$_}
341 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
344 $ordh->{$order} = $name;
346 unless( exists( $self->{'_start_end_lists'}->{$id})) {
347 $self->{'_start_end_lists'}->{$id} = [];
349 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
352 $self->{'_seq'}->{$name} = $seq;
357 =head2 remove_seq
359 Title : remove_seq
360 Usage : $aln->remove_seq($seq);
361 Function : Removes a single sequence from an alignment
362 Returns :
363 Argument : a Bio::LocatableSeq object
365 =cut
367 sub removeSeq {
368 my $self = shift;
369 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
370 $self->remove_seq(@_);
373 sub remove_seq {
374 my $self = shift;
375 my $seq = shift;
376 my ($name,$id);
378 $self->throw("Need Bio::Locatable seq argument ")
379 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
381 $id = $seq->id();
382 $name = $seq->get_nse;
384 if( !exists $self->{'_seq'}->{$name} ) {
385 $self->throw("Sequence $name does not exist in the alignment to remove!");
388 delete $self->{'_seq'}->{$name};
390 # we need to remove this seq from the start_end_lists hash
392 if (exists $self->{'_start_end_lists'}->{$id}) {
393 # we need to find the sequence in the array.
395 my ($i, $found);;
396 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
397 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
398 $found = 1;
399 last;
402 if ($found) {
403 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
405 else {
406 $self->throw("Could not find the sequence to remoce from the start-end list");
409 else {
410 $self->throw("There is no seq list for the name $id");
412 # we need to shift order hash
413 my %rev_order = reverse %{$self->{'_order'}};
414 my $no = $rev_order{$name};
415 my $num_sequences = $self->num_sequences;
416 for (; $no < $num_sequences; $no++) {
417 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
419 delete $self->{'_order'}->{$no};
420 return 1;
424 =head2 purge
426 Title : purge
427 Usage : $aln->purge(0.7);
428 Function: Removes sequences above given sequence similarity
429 This function will grind on large alignments. Beware!
430 Example :
431 Returns : An array of the removed sequences
432 Args : float, threshold for similarity
434 =cut
436 sub purge {
437 my ($self,$perc) = @_;
438 my (%duplicate, @dups);
440 my @seqs = $self->each_seq();
442 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
443 my $seq = $seqs[$i];
445 #skip if already in duplicate hash
446 next if exists $duplicate{$seq->display_id} ;
447 my $one = $seq->seq();
449 my @one = split '', $one; #split to get 1aa per array element
451 for (my $j=$i+1;$j < @seqs;$j++) {
452 my $seq2 = $seqs[$j];
454 #skip if already in duplicate hash
455 next if exists $duplicate{$seq2->display_id} ;
457 my $two = $seq2->seq();
458 my @two = split '', $two;
460 my $count = 0;
461 my $res = 0;
462 for (my $k=0;$k<@one;$k++) {
463 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
464 $one[$k] eq $two[$k]) {
465 $count++;
467 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
468 $two[$k] ne '.' && $two[$k] ne '-' ) {
469 $res++;
473 my $ratio = 0;
474 $ratio = $count/$res unless $res == 0;
476 # if above threshold put in duplicate hash and push onto
477 # duplicate array for returning to get_unique
478 if ( $ratio > $perc ) {
479 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
480 $duplicate{$seq2->display_id} = 1;
481 push @dups, $seq2;
485 foreach my $seq (@dups) {
486 $self->remove_seq($seq);
488 return @dups;
491 =head2 sort_alphabetically
493 Title : sort_alphabetically
494 Usage : $ali->sort_alphabetically
495 Function : Changes the order of the alignment to alphabetical on name
496 followed by numerical by number.
497 Returns :
498 Argument :
500 =cut
502 sub sort_alphabetically {
503 my $self = shift;
504 my ($seq,$nse,@arr,%hash,$count);
506 foreach $seq ( $self->each_seq() ) {
507 $nse = $seq->get_nse;
508 $hash{$nse} = $seq;
511 $count = 0;
513 %{$self->{'_order'}} = (); # reset the hash;
515 foreach $nse ( sort _alpha_startend keys %hash) {
516 $self->{'_order'}->{$count} = $nse;
518 $count++;
523 =head2 sort_by_list
525 Title : sort_by_list
526 Usage : $aln_ordered=$aln->sort_by_list($list_file)
527 Function : Arbitrarily order sequences in an alignment
528 Returns : A new Bio::SimpleAlign object
529 Argument : a file listing sequence names in intended order (one name per line)
531 =cut
533 sub sort_by_list {
534 my ($self, $list) = @_;
535 my (@seq, @ids, %order);
537 foreach my $seq ( $self->each_seq() ) {
538 push @seq, $seq;
539 push @ids, $seq->display_id;
542 my $ct=1;
543 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
544 while (<$listfh>) {
545 chomp;
546 my $name=$_;
547 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
548 $order{$name}=$ct++;
550 close($listfh);
552 # use the map-sort-map idiom:
553 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
554 my $aln = $self->new;
555 foreach (@sorted) { $aln->add_seq($_) }
556 return $aln;
559 =head2 set_new_reference
561 Title : set_new_reference
562 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
563 the sequence whoes name is "B31" (full, exact, and case-sensitive),
564 as the reference (1st) sequence
565 Function : Change/Set a new reference (i.e., the first) sequence
566 Returns : a new Bio::SimpleAlign object.
567 Throws an exception if designated sequence not found
568 Argument : a positive integer of sequence order, or a sequence name
569 in the original alignment
571 =cut
573 sub set_new_reference {
574 my ($self, $seqid) = @_;
575 my $aln = $self->new;
576 my (@seq, @ids, @new_seq);
577 my $is_num=0;
578 foreach my $seq ( $self->each_seq() ) {
579 push @seq, $seq;
580 push @ids, $seq->display_id;
583 if ($seqid =~ /^\d+$/) { # argument is seq position
584 $is_num=1;
585 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
586 } else { # argument is a seq name
587 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
590 for (my $i=0; $i<=$#seq; $i++) {
591 my $pos=$i+1;
592 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
593 unshift @new_seq, $seq[$i];
594 } else {
595 push @new_seq, $seq[$i];
598 foreach (@new_seq) { $aln->add_seq($_); }
599 return $aln;
602 sub _in_aln { # check if input name exists in the alignment
603 my ($str, $ref) = @_;
604 foreach (@$ref) {
605 return 1 if $str eq $_;
607 return 0;
611 =head2 uniq_seq
613 Title : uniq_seq
614 Usage : $aln->uniq_seq(): Remove identical sequences in
615 in the alignment. Ambiguous base ("N", "n") and
616 leading and ending gaps ("-") are NOT counted as
617 differences.
618 Function : Make a new alignment of unique sequence types (STs)
619 Returns : 1a. if called in a scalar context,
620 a new Bio::SimpleAlign object (all sequences renamed as "ST")
621 1b. if called in an array context,
622 a new Bio::SimpleAlign object, and a hashref whose keys
623 are sequence types, and whose values are arrayrefs to
624 lists of sequence ids within the corresponding sequence type
625 2. if $aln->verbose > 0, ST of each sequence is sent to
626 STDERR (in a tabular format)
627 Argument : None
629 =cut
631 sub uniq_seq {
632 my ($self, $seqid) = @_;
633 my $aln = $self->new;
634 my (%member, %order, @seq, @uniq_str, $st);
635 my $order=0;
636 my $len = $self->length();
637 $st = {};
638 foreach my $seq ( $self->each_seq() ) {
639 my $str = $seq->seq();
641 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
642 # comparing two sequence strings
644 # 1st, convert "n", "N" to "?" (for DNA sequence only):
645 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
646 # 2nd, convert leading and ending gaps to "?":
647 $str = &_convert_leading_ending_gaps($str, '-', '?');
648 # Note that '?' also can mean unknown residue.
649 # I don't like making global class member changes like this, too
650 # prone to errors... -- cjfields 08-11-18
651 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
652 my $new = Bio::LocatableSeq->new(
653 -id => $seq->id(),
654 -alphabet=> $seq->alphabet,
655 -seq => $str,
656 -start => $seq->start,
657 -end => $seq->end
659 push @seq, $new;
662 foreach my $seq (@seq) {
663 my $str = $seq->seq();
664 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
665 if ($seen) { # seen before
666 my @memb = @{$member{$key}};
667 push @memb, $seq;
668 $member{$key} = \@memb;
669 } else { # not seen
670 push @uniq_str, $key;
671 $order++;
672 $member{$key} = [ ($seq) ];
673 $order{$key} = $order;
677 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
678 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
679 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
680 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
681 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
682 my $gap='-';
683 my $end= CORE::length($str2);
684 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
685 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
686 -seq =>$str2,
687 -start=>1,
688 -end =>$end
690 $aln->add_seq($new);
691 foreach (@{$member{$str}}) {
692 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
693 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
696 return wantarray ? ($aln, $st) : $aln;
699 sub _check_uniq { # check if same seq exists in the alignment
700 my ($str1, $ref, $length) = @_;
701 my @char1=split //, $str1;
702 my @array=@$ref;
704 return (0, $str1) if @array==0; # not seen (1st sequence)
706 foreach my $str2 (@array) {
707 my $diff=0;
708 my @char2=split //, $str2;
709 for (my $i=0; $i<=$length-1; $i++) {
710 next if $char1[$i] eq '?';
711 next if $char2[$i] eq '?';
712 $diff++ if $char1[$i] ne $char2[$i];
714 return (1, $str2) if $diff == 0; # seen before
717 return (0, $str1); # not seen
720 sub _convert_leading_ending_gaps {
721 my $s=shift;
722 my $sym1=shift;
723 my $sym2=shift;
724 my @array=split //, $s;
725 # convert leading char:
726 for (my $i=0; $i<=$#array; $i++) {
727 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
729 # convert ending char:
730 for (my $i = $#array; $i>= 0; $i--) {
731 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
733 my $s_new=join '', @array;
734 return $s_new;
737 =head1 Sequence selection methods
739 Methods returning one or more sequences objects.
741 =head2 each_seq
743 Title : each_seq
744 Usage : foreach $seq ( $align->each_seq() )
745 Function : Gets a Seq object from the alignment
746 Returns : Seq object
747 Argument :
749 =cut
751 sub eachSeq {
752 my $self = shift;
753 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
754 $self->each_seq();
757 sub each_seq {
758 my $self = shift;
759 my (@arr,$order);
761 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
762 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
763 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
766 return @arr;
770 =head2 each_alphabetically
772 Title : each_alphabetically
773 Usage : foreach $seq ( $ali->each_alphabetically() )
774 Function : Returns a sequence object, but the objects are returned
775 in alphabetically sorted order.
776 Does not change the order of the alignment.
777 Returns : Seq object
778 Argument :
780 =cut
782 sub each_alphabetically {
783 my $self = shift;
784 my ($seq,$nse,@arr,%hash,$count);
786 foreach $seq ( $self->each_seq() ) {
787 $nse = $seq->get_nse;
788 $hash{$nse} = $seq;
791 foreach $nse ( sort _alpha_startend keys %hash) {
792 push(@arr,$hash{$nse});
794 return @arr;
797 sub _alpha_startend {
798 my ($aname,$astart,$bname,$bstart);
799 ($aname,$astart) = split (/-/,$a);
800 ($bname,$bstart) = split (/-/,$b);
802 if( $aname eq $bname ) {
803 return $astart <=> $bstart;
805 else {
806 return $aname cmp $bname;
810 =head2 each_seq_with_id
812 Title : each_seq_with_id
813 Usage : foreach $seq ( $align->each_seq_with_id() )
814 Function : Gets a Seq objects from the alignment, the contents
815 being those sequences with the given name (there may be
816 more than one)
817 Returns : Seq object
818 Argument : a seq name
820 =cut
822 sub eachSeqWithId {
823 my $self = shift;
824 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
825 $self->each_seq_with_id(@_);
828 sub each_seq_with_id {
829 my $self = shift;
830 my $id = shift;
832 $self->throw("Method each_seq_with_id needs a sequence name argument")
833 unless defined $id;
835 my (@arr, $seq);
837 if (exists($self->{'_start_end_lists'}->{$id})) {
838 @arr = @{$self->{'_start_end_lists'}->{$id}};
840 return @arr;
843 =head2 get_seq_by_pos
845 Title : get_seq_by_pos
846 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
847 Function : Gets a sequence based on its position in the alignment.
848 Numbering starts from 1. Sequence positions larger than
849 num_sequences() will thow an error.
850 Returns : a Bio::LocatableSeq object
851 Args : positive integer for the sequence position
853 =cut
855 sub get_seq_by_pos {
857 my $self = shift;
858 my ($pos) = @_;
860 $self->throw("Sequence position has to be a positive integer, not [$pos]")
861 unless $pos =~ /^\d+$/ and $pos > 0;
862 $self->throw("No sequence at position [$pos]")
863 unless $pos <= $self->num_sequences ;
865 my $nse = $self->{'_order'}->{--$pos};
866 return $self->{'_seq'}->{$nse};
869 =head2 get_seq_by_id
871 Title : get_seq_by_id
872 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
873 Function : Gets a sequence based on its name.
874 Sequences that do not exist will warn and return undef
875 Returns : a Bio::LocatableSeq object
876 Args : string for sequence name
878 =cut
880 sub get_seq_by_id {
881 my ($self,$name) = @_;
882 unless( defined $name ) {
883 $self->warn("Must provide a sequence name");
884 return;
886 for my $seq ( values %{$self->{'_seq'}} ) {
887 if ( $seq->id eq $name) {
888 return $seq;
891 return;
894 =head2 seq_with_features
896 Title : seq_with_features
897 Usage : $seq = $aln->seq_with_features(-pos => 1,
898 -consensus => 60
899 -mask =>
900 sub { my $consensus = shift;
902 for my $i (1..5){
903 my $n = 'N' x $i;
904 my $q = '\?' x $i;
905 while($consensus =~ /[^?]$q[^?]/){
906 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
909 return $consensus;
912 Function: produces a Bio::Seq object by first splicing gaps from -pos
913 (by means of a splice_by_seq_pos() call), then creating
914 features using non-? chars (by means of a consensus_string()
915 call with stringency -consensus).
916 Returns : a Bio::Seq object
917 Args : -pos : required. sequence from which to build the Bio::Seq
918 object
919 -consensus : optional, defaults to consensus_string()'s
920 default cutoff value
921 -mask : optional, a coderef to apply to consensus_string()'s
922 output before building features. this may be useful for
923 closing gaps of 1 bp by masking over them with N, for
924 instance
926 =cut
928 sub seq_with_features{
929 my ($self,%arg) = @_;
931 #first do the preparatory splice
932 $self->throw("must provide a -pos argument") unless $arg{-pos};
933 $self->splice_by_seq_pos($arg{-pos});
935 my $consensus_string = $self->consensus_string($arg{-consensus});
936 $consensus_string = $arg{-mask}->($consensus_string)
937 if defined($arg{-mask});
939 my(@bs,@es);
941 push @bs, 1 if $consensus_string =~ /^[^?]/;
943 while($consensus_string =~ /\?[^?]/g){
944 push @bs, pos($consensus_string);
946 while($consensus_string =~ /[^?]\?/g){
947 push @es, pos($consensus_string);
950 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
952 my $seq = Bio::Seq->new();
954 # my $rootfeature = Bio::SeqFeature::Generic->new(
955 # -source_tag => 'location',
956 # -start => $self->get_seq_by_pos($arg{-pos})->start,
957 # -end => $self->get_seq_by_pos($arg{-pos})->end,
958 # );
959 # $seq->add_SeqFeature($rootfeature);
961 while(my $b = shift @bs){
962 my $e = shift @es;
963 $seq->add_SeqFeature(
964 Bio::SeqFeature::Generic->new(
965 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
966 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
967 -source_tag => $self->source || 'MSA',
972 return $seq;
976 =head1 Create new alignments
978 The result of these methods are horizontal or vertical subsets of the
979 current MSA.
981 =head2 select
983 Title : select
984 Usage : $aln2 = $aln->select(1, 3) # three first sequences
985 Function : Creates a new alignment from a continuous subset of
986 sequences. Numbering starts from 1. Sequence positions
987 larger than num_sequences() will thow an error.
988 Returns : a Bio::SimpleAlign object
989 Args : positive integer for the first sequence
990 positive integer for the last sequence to include (optional)
992 =cut
994 sub select {
995 my $self = shift;
996 my ($start, $end) = @_;
998 $self->throw("Select start has to be a positive integer, not [$start]")
999 unless $start =~ /^\d+$/ and $start > 0;
1000 $self->throw("Select end has to be a positive integer, not [$end]")
1001 unless $end =~ /^\d+$/ and $end > 0;
1002 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1003 unless $start <= $end;
1005 my $aln = $self->new;
1006 foreach my $pos ($start .. $end) {
1007 $aln->add_seq($self->get_seq_by_pos($pos));
1009 $aln->id($self->id);
1010 # fix for meta, sf, ann
1011 return $aln;
1014 =head2 select_noncont
1016 Title : select_noncont
1017 Usage : # 1st and 3rd sequences, sorted
1018 $aln2 = $aln->select_noncont(1, 3)
1020 # 1st and 3rd sequences, sorted (same as first)
1021 $aln2 = $aln->select_noncont(3, 1)
1023 # 1st and 3rd sequences, unsorted
1024 $aln2 = $aln->select_noncont('nosort',3, 1)
1026 Function : Creates a new alignment from a subset of sequences. Numbering
1027 starts from 1. Sequence positions larger than num_sequences() will
1028 throw an error. Sorts the order added to new alignment by default,
1029 to prevent sorting pass 'nosort' as the first argument in the list.
1030 Returns : a Bio::SimpleAlign object
1031 Args : array of integers for the sequences. If the string 'nosort' is
1032 passed as the first argument, the sequences will not be sorted
1033 in the new alignment but will appear in the order listed.
1035 =cut
1037 sub select_noncont {
1038 my $self = shift;
1039 my $nosort = 0;
1040 my (@pos) = @_;
1041 if ($pos[0] !~ m{^\d+$}) {
1042 my $sortcmd = shift @pos;
1043 if ($sortcmd eq 'nosort') {
1044 $nosort = 1;
1045 } else {
1046 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1050 my $end = $self->num_sequences;
1051 foreach ( @pos ) {
1052 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1053 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1056 @pos = sort {$a <=> $b} @pos unless $nosort;
1058 my $aln = $self->new;
1059 foreach my $p (@pos) {
1060 $aln->add_seq($self->get_seq_by_pos($p));
1062 $aln->id($self->id);
1063 # fix for meta, sf, ann
1064 return $aln;
1067 =head2 slice
1069 Title : slice
1070 Usage : $aln2 = $aln->slice(20,30)
1071 Function : Creates a slice from the alignment inclusive of start and
1072 end columns, and the first column in the alignment is denoted 1.
1073 Sequences with no residues in the slice are excluded from the
1074 new alignment and a warning is printed. Slice beyond the length of
1075 the sequence does not do padding.
1076 Returns : A Bio::SimpleAlign object
1077 Args : Positive integer for start column, positive integer for end column,
1078 optional boolean which if true will keep gap-only columns in the newly
1079 created slice. Example:
1081 $aln2 = $aln->slice(20,30,1)
1083 =cut
1085 sub slice {
1086 my $self = shift;
1087 my ($start, $end, $keep_gap_only) = @_;
1089 $self->throw("Slice start has to be a positive integer, not [$start]")
1090 unless $start =~ /^\d+$/ and $start > 0;
1091 $self->throw("Slice end has to be a positive integer, not [$end]")
1092 unless $end =~ /^\d+$/ and $end > 0;
1093 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1094 unless $start <= $end;
1095 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1096 "[$start] is too big.") if $start > $self->length;
1097 my $cons_meta = $self->consensus_meta;
1098 my $aln = $self->new;
1099 $aln->id($self->id);
1100 foreach my $seq ( $self->each_seq() ) {
1101 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1102 Bio::Seq::Meta->new
1103 (-id => $seq->id,
1104 -alphabet => $seq->alphabet,
1105 -strand => $seq->strand,
1106 -verbose => $self->verbose) :
1107 Bio::LocatableSeq->new
1108 (-id => $seq->id,
1109 -alphabet => $seq->alphabet,
1110 -strand => $seq->strand,
1111 -verbose => $self->verbose);
1113 # seq
1114 my $seq_end = $end;
1115 $seq_end = $seq->length if( $end > $seq->length );
1117 my $slice_seq = $seq->subseq($start, $seq_end);
1118 $new_seq->seq( $slice_seq );
1120 $slice_seq =~ s/\W//g;
1122 if ($start > 1) {
1123 my $pre_start_seq = $seq->subseq(1, $start - 1);
1124 $pre_start_seq =~ s/\W//g;
1125 if (!defined($seq->strand)) {
1126 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1127 } elsif ($seq->strand < 0){
1128 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1129 } else {
1130 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1132 } else {
1133 if ((defined $seq->strand)&&($seq->strand < 0)){
1134 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1135 } else {
1136 $new_seq->start( $seq->start);
1139 if ($new_seq->isa('Bio::Seq::MetaI')) {
1140 for my $meta_name ($seq->meta_names) {
1141 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1144 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1146 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1147 $aln->add_seq($new_seq);
1148 } else {
1149 if( $keep_gap_only ) {
1150 $aln->add_seq($new_seq);
1151 } else {
1152 my $nse = $seq->get_nse();
1153 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1154 " Sequence excluded from the new alignment.");
1158 if ($cons_meta) {
1159 my $new = Bio::Seq::Meta->new();
1160 for my $meta_name ($cons_meta->meta_names) {
1161 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1163 $aln->consensus_meta($new);
1165 $aln->annotation($self->annotation);
1166 # fix for meta, sf, ann
1167 return $aln;
1170 =head2 remove_columns
1172 Title : remove_columns
1173 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1174 $aln2 = $aln->remove_columns([0,0],[6,8])
1175 Function : Creates an aligment with columns removed corresponding to
1176 the specified type or by specifying the columns by number.
1177 Returns : Bio::SimpleAlign object
1178 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1179 'all_gaps_columns') or array ref where the referenced array
1180 contains a pair of integers that specify a range.
1181 The first column is 0
1183 =cut
1185 sub remove_columns {
1186 my ($self,@args) = @_;
1187 @args || $self->throw("Must supply column ranges or column types");
1188 my $aln;
1190 if ($args[0][0] =~ /^[a-z_]+$/i) {
1191 $aln = $self->_remove_columns_by_type($args[0]);
1192 } elsif ($args[0][0] =~ /^\d+$/) {
1193 $aln = $self->_remove_columns_by_num(\@args);
1194 } else {
1195 $self->throw("You must pass array references to remove_columns(), not @args");
1197 # fix for meta, sf, ann
1198 $aln;
1202 =head2 remove_gaps
1204 Title : remove_gaps
1205 Usage : $aln2 = $aln->remove_gaps
1206 Function : Creates an aligment with gaps removed
1207 Returns : a Bio::SimpleAlign object
1208 Args : a gap character(optional) if none specified taken
1209 from $self->gap_char,
1210 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1211 indicates that only all-gaps columns should be deleted
1213 Used from method L<remove_columns> in most cases. Set gap character
1214 using L<gap_char()|gap_char>.
1216 =cut
1218 sub remove_gaps {
1219 my ($self,$gapchar,$all_gaps_columns) = @_;
1220 my $gap_line;
1221 if ($all_gaps_columns) {
1222 $gap_line = $self->all_gap_line($gapchar);
1223 } else {
1224 $gap_line = $self->gap_line($gapchar);
1226 my $aln = $self->new;
1228 my @remove;
1229 my $length = 0;
1230 my $del_char = $gapchar || $self->gap_char;
1231 # Do the matching to get the segments to remove
1232 while ($gap_line =~ m/[$del_char]/g) {
1233 my $start = pos($gap_line)-1;
1234 $gap_line=~/\G[$del_char]+/gc;
1235 my $end = pos($gap_line)-1;
1237 #have to offset the start and end for subsequent removes
1238 $start-=$length;
1239 $end -=$length;
1240 $length += ($end-$start+1);
1241 push @remove, [$start,$end];
1244 #remove the segments
1245 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1246 # fix for meta, sf, ann
1247 return $aln;
1251 sub _remove_col {
1252 my ($self,$aln,$remove) = @_;
1253 my @new;
1255 my $gap = $self->gap_char;
1257 # splice out the segments and create new seq
1258 foreach my $seq($self->each_seq){
1259 my $new_seq = Bio::LocatableSeq->new(
1260 -id => $seq->id,
1261 -alphabet=> $seq->alphabet,
1262 -strand => $seq->strand,
1263 -verbose => $self->verbose);
1264 my $sequence = $seq->seq;
1265 foreach my $pair(@{$remove}){
1266 my $start = $pair->[0];
1267 my $end = $pair->[1];
1268 $sequence = $seq->seq unless $sequence;
1269 my $orig = $sequence;
1270 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1271 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1272 $sequence = $head.$tail;
1273 # start
1274 unless (defined $new_seq->start) {
1275 if ($start == 0) {
1276 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1277 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1279 else {
1280 my $start_adjust = $orig =~ /^$gap+/;
1281 if ($start_adjust) {
1282 $start_adjust = $+[0] == $start;
1284 $new_seq->start($seq->start + $start_adjust);
1287 # end
1288 if (($end + 1) >= CORE::length($orig)) {
1289 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1290 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1292 else {
1293 $new_seq->end($seq->end);
1297 if ($new_seq->end < $new_seq->start) {
1298 # we removed all columns except for gaps: set to 0 to indicate no
1299 # sequence
1300 $new_seq->start(0);
1301 $new_seq->end(0);
1304 $new_seq->seq($sequence) if $sequence;
1305 push @new, $new_seq;
1307 # add the new seqs to the alignment
1308 foreach my $new(@new){
1309 $aln->add_seq($new);
1311 # fix for meta, sf, ann
1312 return $aln;
1315 sub _remove_columns_by_type {
1316 my ($self,$type) = @_;
1317 my $aln = $self->new;
1318 my @remove;
1320 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1321 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1322 my %matchchars = ( 'match' => '\*',
1323 'weak' => '\.',
1324 'strong' => ':',
1325 'mismatch' => ' ',
1326 'gaps' => '',
1327 'all_gaps_columns' => ''
1329 # get the characters to delete against
1330 my $del_char;
1331 foreach my $type (@{$type}){
1332 $del_char.= $matchchars{$type};
1335 my $length = 0;
1336 my $match_line = $self->match_line;
1337 # do the matching to get the segments to remove
1338 if($del_char){
1339 while($match_line =~ m/[$del_char]/g ){
1340 my $start = pos($match_line)-1;
1341 $match_line=~/\G[$del_char]+/gc;
1342 my $end = pos($match_line)-1;
1344 #have to offset the start and end for subsequent removes
1345 $start-=$length;
1346 $end -=$length;
1347 $length += ($end-$start+1);
1348 push @remove, [$start,$end];
1352 # remove the segments
1353 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1354 $aln = $aln->remove_gaps() if $gap;
1355 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1356 # fix for meta, sf, ann
1357 $aln;
1361 sub _remove_columns_by_num {
1362 my ($self,$positions) = @_;
1363 my $aln = $self->new;
1365 # sort the positions
1366 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1368 my @remove;
1369 my $length = 0;
1370 foreach my $pos (@{$positions}) {
1371 my ($start, $end) = @{$pos};
1373 #have to offset the start and end for subsequent removes
1374 $start-=$length;
1375 $end -=$length;
1376 $length += ($end-$start+1);
1377 push @remove, [$start,$end];
1380 #remove the segments
1381 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1382 # fix for meta, sf, ann
1383 $aln;
1387 =head1 Change sequences within the MSA
1389 These methods affect characters in all sequences without changing the
1390 alignment.
1392 =head2 splice_by_seq_pos
1394 Title : splice_by_seq_pos
1395 Usage : $status = splice_by_seq_pos(1);
1396 Function: splices all aligned sequences where the specified sequence
1397 has gaps.
1398 Example :
1399 Returns : 1 on success
1400 Args : position of sequence to splice by
1403 =cut
1405 sub splice_by_seq_pos{
1406 my ($self,$pos) = @_;
1408 my $guide = $self->get_seq_by_pos($pos);
1409 my $guide_seq = $guide->seq;
1411 $guide_seq =~ s/\./\-/g;
1413 my @gaps = ();
1414 $pos = -1;
1415 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1416 unshift @gaps, $pos;
1417 $pos++;
1420 foreach my $seq ($self->each_seq){
1421 my @bases = split '', $seq->seq;
1423 splice(@bases, $_, 1) foreach @gaps;
1424 $seq->seq(join('', @bases));
1430 =head2 map_chars
1432 Title : map_chars
1433 Usage : $ali->map_chars('\.','-')
1434 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1435 characters
1437 Notice that the from (arg1) is interpretted as a regex,
1438 so be careful about quoting meta characters (eg
1439 $ali->map_chars('.','-') wont do what you want)
1440 Returns :
1441 Argument : 'from' rexexp
1442 'to' string
1444 =cut
1446 sub map_chars {
1447 my $self = shift;
1448 my $from = shift;
1449 my $to = shift;
1450 my ($seq,$temp);
1452 $self->throw("Need exactly two arguments")
1453 unless defined $from and defined $to;
1455 foreach $seq ( $self->each_seq() ) {
1456 $temp = $seq->seq();
1457 $temp =~ s/$from/$to/g;
1458 $seq->seq($temp);
1460 return 1;
1464 =head2 uppercase
1466 Title : uppercase()
1467 Usage : $ali->uppercase()
1468 Function : Sets all the sequences to uppercase
1469 Returns :
1470 Argument :
1472 =cut
1474 sub uppercase {
1475 my $self = shift;
1476 my $seq;
1477 my $temp;
1479 foreach $seq ( $self->each_seq() ) {
1480 $temp = $seq->seq();
1481 $temp =~ tr/[a-z]/[A-Z]/;
1483 $seq->seq($temp);
1485 return 1;
1488 =head2 cigar_line
1490 Title : cigar_line()
1491 Usage : %cigars = $align->cigar_line()
1492 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1493 Report) line for each sequence in the alignment. Examples are
1494 "1,60" or "5,10:12,58", where the numbers refer to conserved
1495 positions within the alignment. The keys of the hash are the
1496 NSEs (name/start/end) assigned to each sequence.
1497 Args : threshold (optional, defaults to 100)
1498 Returns : Hash of strings (cigar lines)
1500 =cut
1502 sub cigar_line {
1503 my $self = shift;
1504 my $thr=shift||100;
1505 my %cigars;
1507 my @consensus = split "",($self->consensus_string($thr));
1508 my $len = $self->length;
1509 my $gapchar = $self->gap_char;
1511 # create a precursor, something like (1,4,5,6,7,33,45),
1512 # where each number corresponds to a conserved position
1513 foreach my $seq ( $self->each_seq ) {
1514 my @seq = split "", uc ($seq->seq);
1515 my $pos = 1;
1516 for (my $x = 0 ; $x < $len ; $x++ ) {
1517 if ($seq[$x] eq $consensus[$x]) {
1518 push @{$cigars{$seq->get_nse}},$pos;
1519 $pos++;
1520 } elsif ($seq[$x] ne $gapchar) {
1521 $pos++;
1525 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1526 for my $name (keys %cigars) {
1527 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1528 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1529 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1530 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1531 ${$cigars{$name}}[$#{$cigars{$name}}] );
1532 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1533 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1534 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1535 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1539 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1540 for my $name (keys %cigars) {
1541 my @remove;
1542 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1543 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1544 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1545 unshift @remove,$x;
1548 for my $pos (@remove) {
1549 splice @{$cigars{$name}}, $pos, 1;
1552 # join and punctuate
1553 for my $name (keys %cigars) {
1554 my ($start,$end,$str) = "";
1555 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1556 $str .= ($start . "," . $end . ":");
1558 $str =~ s/:$//;
1559 $cigars{$name} = $str;
1561 %cigars;
1565 =head2 match_line
1567 Title : match_line()
1568 Usage : $line = $align->match_line()
1569 Function : Generates a match line - much like consensus string
1570 except that a line indicating the '*' for a match.
1571 Args : (optional) Match line characters ('*' by default)
1572 (optional) Strong match char (':' by default)
1573 (optional) Weak match char ('.' by default)
1574 Returns : String
1576 =cut
1578 sub match_line {
1579 my ($self,$matchlinechar, $strong, $weak) = @_;
1580 my %matchchars = ('match' => $matchlinechar || '*',
1581 'weak' => $weak || '.',
1582 'strong' => $strong || ':',
1583 'mismatch' => ' ',
1586 my @seqchars;
1587 my $alphabet;
1588 foreach my $seq ( $self->each_seq ) {
1589 push @seqchars, [ split(//, uc ($seq->seq)) ];
1590 $alphabet = $seq->alphabet unless defined $alphabet;
1592 my $refseq = shift @seqchars;
1593 # let's just march down the columns
1594 my $matchline;
1595 POS:
1596 foreach my $pos ( 0..$self->length ) {
1597 my $refchar = $refseq->[$pos];
1598 my $char = $matchchars{'mismatch'};
1599 unless( defined $refchar ) {
1600 last if $pos == $self->length; # short circuit on last residue
1601 # this in place to handle jason's soon-to-be-committed
1602 # intron mapping code
1603 goto bottom;
1605 my %col = ($refchar => 1);
1606 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1607 foreach my $seq ( @seqchars ) {
1608 next if $pos >= scalar @$seq;
1609 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1610 $seq->[$pos] eq ' ' );
1611 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1613 my @colresidues = sort keys %col;
1615 # if all the values are the same
1616 if( $dash ) { $char = $matchchars{'mismatch'} }
1617 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1618 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1619 # matches for protein seqs
1620 TYPE:
1621 foreach my $type ( qw(strong weak) ) {
1622 # iterate through categories
1623 my %groups;
1624 # iterate through each of the aa in the col
1625 # look to see which groups it is in
1626 foreach my $c ( @colresidues ) {
1627 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1628 push @{$groups{$f}},$c;
1631 GRP:
1632 foreach my $cols ( values %groups ) {
1633 @$cols = sort @$cols;
1634 # now we are just testing to see if two arrays
1635 # are identical w/o changing either one
1636 # have to be same len
1637 next if( scalar @$cols != scalar @colresidues );
1638 # walk down the length and check each slot
1639 for($_=0;$_ < (scalar @$cols);$_++ ) {
1640 next GRP if( $cols->[$_] ne $colresidues[$_] );
1642 $char = $matchchars{$type};
1643 last TYPE;
1647 bottom:
1648 $matchline .= $char;
1650 return $matchline;
1654 =head2 gap_line
1656 Title : gap_line()
1657 Usage : $line = $align->gap_line()
1658 Function : Generates a gap line - much like consensus string
1659 except that a line where '-' represents gap
1660 Args : (optional) gap line characters ('-' by default)
1661 Returns : string
1663 =cut
1665 sub gap_line {
1666 my ($self,$gapchar) = @_;
1667 $gapchar = $gapchar || $self->gap_char;
1668 my %gap_hsh; # column gaps vector
1669 foreach my $seq ( $self->each_seq ) {
1670 my $i = 0;
1671 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1672 map {[$i++, $_]} split(//, uc ($seq->seq));
1674 my $gap_line;
1675 foreach my $pos ( 0..$self->length-1 ) {
1676 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1678 return $gap_line;
1681 =head2 all_gap_line
1683 Title : all_gap_line()
1684 Usage : $line = $align->all_gap_line()
1685 Function : Generates a gap line - much like consensus string
1686 except that a line where '-' represents all-gap column
1687 Args : (optional) gap line characters ('-' by default)
1688 Returns : string
1690 =cut
1692 sub all_gap_line {
1693 my ($self,$gapchar) = @_;
1694 $gapchar = $gapchar || $self->gap_char;
1695 my %gap_hsh; # column gaps counter hash
1696 my @seqs = $self->each_seq;
1697 foreach my $seq ( @seqs ) {
1698 my $i = 0;
1699 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1700 map {[$i++, $_]} split(//, uc ($seq->seq));
1702 my $gap_line;
1703 foreach my $pos ( 0..$self->length-1 ) {
1704 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1705 # gaps column
1706 $gap_line .= $gapchar;
1707 } else {
1708 $gap_line .= '.';
1711 return $gap_line;
1714 =head2 gap_col_matrix
1716 Title : gap_col_matrix()
1717 Usage : my $cols = $align->gap_col_matrix()
1718 Function : Generates an array of hashes where
1719 each entry in the array is a hash reference
1720 with keys of all the sequence names and
1721 and value of 1 or 0 if the sequence has a gap at that column
1722 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1724 =cut
1726 sub gap_col_matrix {
1727 my ($self,$gapchar) = @_;
1728 $gapchar = $gapchar || $self->gap_char;
1729 my %gap_hsh; # column gaps vector
1730 my @cols;
1731 foreach my $seq ( $self->each_seq ) {
1732 my $i = 0;
1733 my $str = $seq->seq;
1734 my $len = $seq->length;
1735 my $ch;
1736 my $id = $seq->display_id;
1737 while( $i < $len ) {
1738 $ch = substr($str, $i, 1);
1739 $cols[$i++]->{$id} = ($ch eq $gapchar);
1742 return \@cols;
1745 =head2 match
1747 Title : match()
1748 Usage : $ali->match()
1749 Function : Goes through all columns and changes residues that are
1750 identical to residue in first sequence to match '.'
1751 character. Sets match_char.
1753 USE WITH CARE: Most MSA formats do not support match
1754 characters in sequences, so this is mostly for output
1755 only. NEXUS format (Bio::AlignIO::nexus) can handle
1757 Returns : 1
1758 Argument : a match character, optional, defaults to '.'
1760 =cut
1762 sub match {
1763 my ($self, $match) = @_;
1765 $match ||= '.';
1766 my ($matching_char) = $match;
1767 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1768 $self->map_chars($matching_char, '-');
1770 my @seqs = $self->each_seq();
1771 return 1 unless scalar @seqs > 1;
1773 my $refseq = shift @seqs ;
1774 my @refseq = split //, $refseq->seq;
1775 my $gapchar = $self->gap_char;
1777 foreach my $seq ( @seqs ) {
1778 my @varseq = split //, $seq->seq();
1779 for ( my $i=0; $i < scalar @varseq; $i++) {
1780 $varseq[$i] = $match if defined $refseq[$i] &&
1781 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1782 $refseq[$i] =~ /$gapchar/ )
1783 && $refseq[$i] eq $varseq[$i];
1785 $seq->seq(join '', @varseq);
1787 $self->match_char($match);
1788 return 1;
1792 =head2 unmatch
1794 Title : unmatch()
1795 Usage : $ali->unmatch()
1796 Function : Undoes the effect of method match. Unsets match_char.
1797 Returns : 1
1798 Argument : a match character, optional, defaults to '.'
1800 See L<match> and L<match_char>
1802 =cut
1804 sub unmatch {
1805 my ($self, $match) = @_;
1807 $match ||= '.';
1809 my @seqs = $self->each_seq();
1810 return 1 unless scalar @seqs > 1;
1812 my $refseq = shift @seqs ;
1813 my @refseq = split //, $refseq->seq;
1814 my $gapchar = $self->gap_char;
1815 foreach my $seq ( @seqs ) {
1816 my @varseq = split //, $seq->seq();
1817 for ( my $i=0; $i < scalar @varseq; $i++) {
1818 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1819 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1820 $refseq[$i] =~ /$gapchar/ ) &&
1821 $varseq[$i] eq $match;
1823 $seq->seq(join '', @varseq);
1825 $self->match_char('');
1826 return 1;
1829 =head1 MSA attributes
1831 Methods for setting and reading the MSA attributes.
1833 Note that the methods defining character semantics depend on the user
1834 to set them sensibly. They are needed only by certain input/output
1835 methods. Unset them by setting to an empty string ('').
1837 =head2 id
1839 Title : id
1840 Usage : $myalign->id("Ig")
1841 Function : Gets/sets the id field of the alignment
1842 Returns : An id string
1843 Argument : An id string (optional)
1845 =cut
1847 sub id {
1848 my ($self, $name) = @_;
1850 if (defined( $name )) {
1851 $self->{'_id'} = $name;
1854 return $self->{'_id'};
1857 =head2 accession
1859 Title : accession
1860 Usage : $myalign->accession("PF00244")
1861 Function : Gets/sets the accession field of the alignment
1862 Returns : An acc string
1863 Argument : An acc string (optional)
1865 =cut
1867 sub accession {
1868 my ($self, $acc) = @_;
1870 if (defined( $acc )) {
1871 $self->{'_accession'} = $acc;
1874 return $self->{'_accession'};
1877 =head2 description
1879 Title : description
1880 Usage : $myalign->description("14-3-3 proteins")
1881 Function : Gets/sets the description field of the alignment
1882 Returns : An description string
1883 Argument : An description string (optional)
1885 =cut
1887 sub description {
1888 my ($self, $name) = @_;
1890 if (defined( $name )) {
1891 $self->{'_description'} = $name;
1894 return $self->{'_description'};
1897 =head2 missing_char
1899 Title : missing_char
1900 Usage : $myalign->missing_char("?")
1901 Function : Gets/sets the missing_char attribute of the alignment
1902 It is generally recommended to set it to 'n' or 'N'
1903 for nucleotides and to 'X' for protein.
1904 Returns : An missing_char string,
1905 Argument : An missing_char string (optional)
1907 =cut
1909 sub missing_char {
1910 my ($self, $char) = @_;
1912 if (defined $char ) {
1913 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1914 $self->{'_missing_char'} = $char;
1917 return $self->{'_missing_char'};
1920 =head2 match_char
1922 Title : match_char
1923 Usage : $myalign->match_char('.')
1924 Function : Gets/sets the match_char attribute of the alignment
1925 Returns : An match_char string,
1926 Argument : An match_char string (optional)
1928 =cut
1930 sub match_char {
1931 my ($self, $char) = @_;
1933 if (defined $char ) {
1934 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1935 $self->{'_match_char'} = $char;
1938 return $self->{'_match_char'};
1941 =head2 gap_char
1943 Title : gap_char
1944 Usage : $myalign->gap_char('-')
1945 Function : Gets/sets the gap_char attribute of the alignment
1946 Returns : An gap_char string, defaults to '-'
1947 Argument : An gap_char string (optional)
1949 =cut
1951 sub gap_char {
1952 my ($self, $char) = @_;
1954 if (defined $char || ! defined $self->{'_gap_char'} ) {
1955 $char= '-' unless defined $char;
1956 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1957 $self->{'_gap_char'} = $char;
1959 return $self->{'_gap_char'};
1962 =head2 symbol_chars
1964 Title : symbol_chars
1965 Usage : my @symbolchars = $aln->symbol_chars;
1966 Function: Returns all the seen symbols (other than gaps)
1967 Returns : array of characters that are the seen symbols
1968 Args : boolean to include the gap/missing/match characters
1970 =cut
1972 sub symbol_chars{
1973 my ($self,$includeextra) = @_;
1975 unless ($self->{'_symbols'}) {
1976 foreach my $seq ($self->each_seq) {
1977 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1980 my %copy = %{$self->{'_symbols'}};
1981 if( ! $includeextra ) {
1982 foreach my $char ( $self->gap_char, $self->match_char,
1983 $self->missing_char) {
1984 delete $copy{$char} if( defined $char );
1987 return keys %copy;
1990 =head1 Alignment descriptors
1992 These read only methods describe the MSA in various ways.
1995 =head2 score
1997 Title : score
1998 Usage : $str = $ali->score()
1999 Function : get/set a score of the alignment
2000 Returns : a score for the alignment
2001 Argument : an optional score to set
2003 =cut
2005 sub score {
2006 my $self = shift;
2007 $self->{score} = shift if @_;
2008 return $self->{score};
2011 =head2 consensus_string
2013 Title : consensus_string
2014 Usage : $str = $ali->consensus_string($threshold_percent)
2015 Function : Makes a strict consensus
2016 Returns : Consensus string
2017 Argument : Optional threshold ranging from 0 to 100.
2018 The consensus residue has to appear at least threshold %
2019 of the sequences at a given location, otherwise a '?'
2020 character will be placed at that location.
2021 (Default value = 0%)
2023 =cut
2025 sub consensus_string {
2026 my $self = shift;
2027 my $threshold = shift;
2029 my $out = "";
2030 my $len = $self->length - 1;
2032 foreach ( 0 .. $len ) {
2033 $out .= $self->_consensus_aa($_,$threshold);
2035 return $out;
2038 =head2 consensus_conservation
2040 Title : consensus_conservation
2041 Usage : @conservation = $ali->consensus_conservation();
2042 Function : Conservation (as a percent) of each position of alignment
2043 Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2044 Argument :
2046 =cut
2048 sub consensus_conservation {
2049 my $self = shift;
2050 my @cons;
2051 my $num_sequences = $self->num_sequences;
2052 foreach my $point (0..$self->length-1) {
2053 my %hash = $self->_consensus_counts($point);
2054 # max frequency of a non-gap letter
2055 my $max = (sort {$b<=>$a} values %hash )[0];
2056 push @cons, 100 * $max / $num_sequences;
2058 return @cons;
2061 sub _consensus_aa {
2062 my $self = shift;
2063 my $point = shift;
2064 my $threshold_percent = shift || -1 ;
2065 my ($seq,%hash,$count,$letter,$key);
2066 my $gapchar = $self->gap_char;
2067 %hash = $self->_consensus_counts($point);
2068 my $number_of_sequences = $self->num_sequences();
2069 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2070 $count = -1;
2071 $letter = '?';
2073 foreach $key ( sort keys %hash ) {
2074 # print "Now at $key $hash{$key}\n";
2075 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2076 $letter = $key;
2077 $count = $hash{$key};
2080 return $letter;
2083 # Frequency of each letter in one column
2084 sub _consensus_counts {
2085 my $self = shift;
2086 my $point = shift;
2087 my %hash;
2088 my $gapchar = $self->gap_char;
2089 foreach my $seq ( $self->each_seq() ) {
2090 my $letter = substr($seq->seq,$point,1);
2091 $self->throw("--$point-----------") if $letter eq '';
2092 ($letter eq $gapchar || $letter =~ /\./) && next;
2093 $hash{$letter}++;
2095 return %hash;
2099 =head2 consensus_iupac
2101 Title : consensus_iupac
2102 Usage : $str = $ali->consensus_iupac()
2103 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2104 and RNA. The output is in upper case except when gaps in
2105 a column force output to be in lower case.
2107 Note that if your alignment sequences contain a lot of
2108 IUPAC ambiquity codes you often have to manually set
2109 alphabet. Bio::PrimarySeq::_guess_type thinks they
2110 indicate a protein sequence.
2111 Returns : consensus string
2112 Argument : none
2113 Throws : on protein sequences
2115 =cut
2117 sub consensus_iupac {
2118 my $self = shift;
2119 my $out = "";
2120 my $len = $self->length-1;
2122 # only DNA and RNA sequences are valid
2123 foreach my $seq ( $self->each_seq() ) {
2124 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2125 if $seq->alphabet eq 'protein';
2127 # loop over the alignment columns
2128 foreach my $count ( 0 .. $len ) {
2129 $out .= $self->_consensus_iupac($count);
2131 return $out;
2134 sub _consensus_iupac {
2135 my ($self, $column) = @_;
2136 my ($string, $char, $rna);
2138 #determine all residues in a column
2139 foreach my $seq ( $self->each_seq() ) {
2140 $string .= substr($seq->seq, $column, 1);
2142 $string = uc $string;
2144 # quick exit if there's an N in the string
2145 if ($string =~ /N/) {
2146 $string =~ /\W/ ? return 'n' : return 'N';
2148 # ... or if there are only gap characters
2149 return '-' if $string =~ /^\W+$/;
2151 # treat RNA as DNA in regexps
2152 if ($string =~ /U/) {
2153 $string =~ s/U/T/;
2154 $rna = 1;
2157 # the following s///'s only need to be done to the _first_ ambiguity code
2158 # as we only need to see the _range_ of characters in $string
2160 if ($string =~ /[VDHB]/) {
2161 $string =~ s/V/AGC/;
2162 $string =~ s/D/AGT/;
2163 $string =~ s/H/ACT/;
2164 $string =~ s/B/CTG/;
2167 if ($string =~ /[SKYRWM]/) {
2168 $string =~ s/S/GC/;
2169 $string =~ s/K/GT/;
2170 $string =~ s/Y/CT/;
2171 $string =~ s/R/AG/;
2172 $string =~ s/W/AT/;
2173 $string =~ s/M/AC/;
2176 # and now the guts of the thing
2178 if ($string =~ /A/) {
2179 $char = 'A'; # A A
2180 if ($string =~ /G/) {
2181 $char = 'R'; # A and G (purines) R
2182 if ($string =~ /C/) {
2183 $char = 'V'; # A and G and C V
2184 if ($string =~ /T/) {
2185 $char = 'N'; # A and G and C and T N
2187 } elsif ($string =~ /T/) {
2188 $char = 'D'; # A and G and T D
2190 } elsif ($string =~ /C/) {
2191 $char = 'M'; # A and C M
2192 if ($string =~ /T/) {
2193 $char = 'H'; # A and C and T H
2195 } elsif ($string =~ /T/) {
2196 $char = 'W'; # A and T W
2198 } elsif ($string =~ /C/) {
2199 $char = 'C'; # C C
2200 if ($string =~ /T/) {
2201 $char = 'Y'; # C and T (pyrimidines) Y
2202 if ($string =~ /G/) {
2203 $char = 'B'; # C and T and G B
2205 } elsif ($string =~ /G/) {
2206 $char = 'S'; # C and G S
2208 } elsif ($string =~ /G/) {
2209 $char = 'G'; # G G
2210 if ($string =~ /C/) {
2211 $char = 'S'; # G and C S
2212 } elsif ($string =~ /T/) {
2213 $char = 'K'; # G and T K
2215 } elsif ($string =~ /T/) {
2216 $char = 'T'; # T T
2219 $char = 'U' if $rna and $char eq 'T';
2220 $char = lc $char if $string =~ /\W/;
2222 return $char;
2226 =head2 consensus_meta
2228 Title : consensus_meta
2229 Usage : $seqmeta = $ali->consensus_meta()
2230 Function : Returns a Bio::Seq::Meta object containing the consensus
2231 strings derived from meta data analysis.
2232 Returns : Bio::Seq::Meta
2233 Argument : Bio::Seq::Meta
2234 Throws : non-MetaI object
2236 =cut
2238 sub consensus_meta {
2239 my ($self, $meta) = @_;
2240 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2241 $self->throw('Not a Bio::Seq::MetaI object');
2243 return $self->{'_aln_meta'} = $meta if $meta;
2244 return $self->{'_aln_meta'}
2247 =head2 is_flush
2249 Title : is_flush
2250 Usage : if ( $ali->is_flush() )
2251 Function : Tells you whether the alignment
2252 : is flush, i.e. all of the same length
2253 Returns : 1 or 0
2254 Argument :
2256 =cut
2258 sub is_flush {
2259 my ($self,$report) = @_;
2260 my $seq;
2261 my $length = (-1);
2262 my $temp;
2264 foreach $seq ( $self->each_seq() ) {
2265 if( $length == (-1) ) {
2266 $length = CORE::length($seq->seq());
2267 next;
2270 $temp = CORE::length($seq->seq());
2271 if( $temp != $length ) {
2272 $self->warn("expecting $length not $temp from ".
2273 $seq->display_id) if( $report );
2274 $self->debug("expecting $length not $temp from ".
2275 $seq->display_id);
2276 $self->debug($seq->seq(). "\n");
2277 return 0;
2281 return 1;
2285 =head2 length
2287 Title : length()
2288 Usage : $len = $ali->length()
2289 Function : Returns the maximum length of the alignment.
2290 To be sure the alignment is a block, use is_flush
2291 Returns : Integer
2292 Argument :
2294 =cut
2296 sub length_aln {
2297 my $self = shift;
2298 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2299 $self->length(@_);
2302 sub length {
2303 my $self = shift;
2304 my $seq;
2305 my $length = -1;
2306 my $temp;
2308 foreach $seq ( $self->each_seq() ) {
2309 $temp = $seq->length();
2310 if( $temp > $length ) {
2311 $length = $temp;
2315 return $length;
2319 =head2 maxdisplayname_length
2321 Title : maxdisplayname_length
2322 Usage : $ali->maxdisplayname_length()
2323 Function : Gets the maximum length of the displayname in the
2324 alignment. Used in writing out various MSA formats.
2325 Returns : integer
2326 Argument :
2328 =cut
2330 sub maxname_length {
2331 my $self = shift;
2332 $self->deprecated("maxname_length - deprecated method.".
2333 " Use maxdisplayname_length() instead.");
2334 $self->maxdisplayname_length();
2337 sub maxnse_length {
2338 my $self = shift;
2339 $self->deprecated("maxnse_length - deprecated method.".
2340 " Use maxnse_length() instead.");
2341 $self->maxdisplayname_length();
2344 sub maxdisplayname_length {
2345 my $self = shift;
2346 my $maxname = (-1);
2347 my ($seq,$len);
2349 foreach $seq ( $self->each_seq() ) {
2350 $len = CORE::length $self->displayname($seq->get_nse());
2352 if( $len > $maxname ) {
2353 $maxname = $len;
2357 return $maxname;
2360 =head2 max_metaname_length
2362 Title : max_metaname_length
2363 Usage : $ali->max_metaname_length()
2364 Function : Gets the maximum length of the meta name tags in the
2365 alignment for the sequences and for the alignment.
2366 Used in writing out various MSA formats.
2367 Returns : integer
2368 Argument : None
2370 =cut
2372 sub max_metaname_length {
2373 my $self = shift;
2374 my $maxname = (-1);
2375 my ($seq,$len);
2377 # check seq meta first
2378 for $seq ( $self->each_seq() ) {
2379 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2380 for my $mtag ($seq->meta_names) {
2381 $len = CORE::length $mtag;
2382 if( $len > $maxname ) {
2383 $maxname = $len;
2388 # alignment meta
2389 for my $meta ($self->consensus_meta) {
2390 next unless $meta;
2391 for my $name ($meta->meta_names) {
2392 $len = CORE::length $name;
2393 if( $len > $maxname ) {
2394 $maxname = $len;
2399 return $maxname;
2402 =head2 num_residues
2404 Title : num_residues
2405 Usage : $no = $ali->num_residues
2406 Function : number of residues in total in the alignment
2407 Returns : integer
2408 Argument :
2409 Note : replaces no_residues()
2411 =cut
2413 sub num_residues {
2414 my $self = shift;
2415 my $count = 0;
2417 foreach my $seq ($self->each_seq) {
2418 my $str = $seq->seq();
2420 $count += ($str =~ s/[A-Za-z]//g);
2423 return $count;
2426 =head2 num_sequences
2428 Title : num_sequences
2429 Usage : $depth = $ali->num_sequences
2430 Function : number of sequence in the sequence alignment
2431 Returns : integer
2432 Argument : none
2433 Note : replaces no_sequences()
2435 =cut
2437 sub num_sequences {
2438 my $self = shift;
2439 return scalar($self->each_seq);
2443 =head2 average_percentage_identity
2445 Title : average_percentage_identity
2446 Usage : $id = $align->average_percentage_identity
2447 Function: The function uses a fast method to calculate the average
2448 percentage identity of the alignment
2449 Returns : The average percentage identity of the alignment
2450 Args : None
2451 Notes : This method implemented by Kevin Howe calculates a figure that is
2452 designed to be similar to the average pairwise identity of the
2453 alignment (identical in the absence of gaps), without having to
2454 explicitly calculate pairwise identities proposed by Richard Durbin.
2455 Validated by Ewan Birney ad Alex Bateman.
2457 =cut
2459 sub average_percentage_identity{
2460 my ($self,@args) = @_;
2462 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2463 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2465 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2467 if (! $self->is_flush()) {
2468 $self->throw("All sequences in the alignment must be the same length");
2471 @seqs = $self->each_seq();
2472 $len = $self->length();
2474 # load the each hash with correct keys for existence checks
2476 for( my $index=0; $index < $len; $index++) {
2477 foreach my $letter (@alphabet) {
2478 $countHashes[$index]->{$letter} = 0;
2481 foreach my $seq (@seqs) {
2482 my @seqChars = split //, $seq->seq();
2483 for( my $column=0; $column < @seqChars; $column++ ) {
2484 my $char = uc($seqChars[$column]);
2485 if (exists $countHashes[$column]->{$char}) {
2486 $countHashes[$column]->{$char}++;
2491 $total = 0;
2492 $divisor = 0;
2493 for(my $column =0; $column < $len; $column++) {
2494 my %hash = %{$countHashes[$column]};
2495 $subdivisor = 0;
2496 foreach my $res (keys %hash) {
2497 $total += $hash{$res}*($hash{$res} - 1);
2498 $subdivisor += $hash{$res};
2500 $divisor += $subdivisor * ($subdivisor - 1);
2502 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2505 =head2 percentage_identity
2507 Title : percentage_identity
2508 Usage : $id = $align->percentage_identity
2509 Function: The function calculates the average percentage identity
2510 (aliased to average_percentage_identity)
2511 Returns : The average percentage identity
2512 Args : None
2514 =cut
2516 sub percentage_identity {
2517 my $self = shift;
2518 return $self->average_percentage_identity();
2521 =head2 overall_percentage_identity
2523 Title : overall_percentage_identity
2524 Usage : $id = $align->overall_percentage_identity
2525 $id = $align->overall_percentage_identity('short')
2526 Function: The function calculates the percentage identity of
2527 the conserved columns
2528 Returns : The percentage identity of the conserved columns
2529 Args : length value to use, optional defaults to alignment length
2530 possible values: 'align', 'short', 'long'
2532 The argument values 'short' and 'long' refer to shortest and longest
2533 sequence in the alignment. Method modification code by Hongyu Zhang.
2535 =cut
2537 sub overall_percentage_identity{
2538 my ($self, $length_measure) = @_;
2540 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2542 my %enum = map {$_ => undef} qw (align short long);
2544 $self->throw("Unknown argument [$length_measure]")
2545 if $length_measure and not exists $enum{$length_measure};
2546 $length_measure ||= 'align';
2548 if (! $self->is_flush()) {
2549 $self->throw("All sequences in the alignment must be the same length");
2552 # Count the residues seen at each position
2553 my $len;
2554 my $total = 0; # number of positions with identical residues
2555 my @countHashes;
2556 my @seqs = $self->each_seq;
2557 my $nof_seqs = scalar @seqs;
2558 my $aln_len = $self->length();
2559 for my $seq (@seqs) {
2560 my $seqstr = $seq->seq;
2562 # Count residues for given sequence
2563 for my $column (0 .. $aln_len-1) {
2564 my $char = uc( substr($seqstr, $column, 1) );
2565 if ( exists $alphabet{$char} ) {
2567 # This is a valid char
2568 if ( defined $countHashes[$column]->{$char} ) {
2569 $countHashes[$column]->{$char}++;
2570 } else {
2571 $countHashes[$column]->{$char} = 1;
2574 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2575 # All sequences have this same residue
2576 $total++;
2582 # Sequence length
2583 if ($length_measure eq 'short' || $length_measure eq 'long') {
2584 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2585 if ($length_measure eq 'short') {
2586 if ( (not defined $len) || ($seq_len < $len) ) {
2587 $len = $seq_len;
2589 } elsif ($length_measure eq 'long') {
2590 if ( (not defined $len) || ($seq_len > $len) ) {
2591 $len = $seq_len;
2598 if ($length_measure eq 'align') {
2599 $len = $aln_len;
2602 return ($total / $len ) * 100.0;
2607 =head1 Alignment positions
2609 Methods to map a sequence position into an alignment column and back.
2610 column_from_residue_number() does the former. The latter is really a
2611 property of the sequence object and can done using
2612 L<Bio::LocatableSeq::location_from_column>:
2614 # select somehow a sequence from the alignment, e.g.
2615 my $seq = $aln->get_seq_by_pos(1);
2616 #$loc is undef or Bio::LocationI object
2617 my $loc = $seq->location_from_column(5);
2619 =head2 column_from_residue_number
2621 Title : column_from_residue_number
2622 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2623 Function: This function gives the position in the alignment
2624 (i.e. column number) of the given residue number in the
2625 sequence with the given name. For example, for the
2626 alignment
2628 Seq1/91-97 AC..DEF.GH.
2629 Seq2/24-30 ACGG.RTY...
2630 Seq3/43-51 AC.DDEF.GHI
2632 column_from_residue_number( "Seq1", 94 ) returns 6.
2633 column_from_residue_number( "Seq2", 25 ) returns 2.
2634 column_from_residue_number( "Seq3", 50 ) returns 10.
2636 An exception is thrown if the residue number would lie
2637 outside the length of the aligment
2638 (e.g. column_from_residue_number( "Seq2", 22 )
2640 Note: If the the parent sequence is represented by more than
2641 one alignment sequence and the residue number is present in
2642 them, this method finds only the first one.
2644 Returns : A column number for the position in the alignment of the
2645 given residue in the given sequence (1 = first column)
2646 Args : A sequence id/name (not a name/start-end)
2647 A residue number in the whole sequence (not just that
2648 segment of it in the alignment)
2650 =cut
2652 sub column_from_residue_number {
2653 my ($self, $name, $resnumber) = @_;
2655 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2656 $self->throw("Second argument residue number missing") unless $resnumber;
2658 foreach my $seq ($self->each_seq_with_id($name)) {
2659 my $col;
2660 eval {
2661 $col = $seq->column_from_residue_number($resnumber);
2663 next if $@;
2664 return $col;
2667 $self->throw("Could not find a sequence segment in $name ".
2668 "containing residue number $resnumber");
2672 =head1 Sequence names
2674 Methods to manipulate the display name. The default name based on the
2675 sequence id and subsequence positions can be overridden in various
2676 ways.
2678 =head2 displayname
2680 Title : displayname
2681 Usage : $myalign->displayname("Ig", "IgA")
2682 Function : Gets/sets the display name of a sequence in the alignment
2683 Returns : A display name string
2684 Argument : name of the sequence
2685 displayname of the sequence (optional)
2687 =cut
2689 sub get_displayname {
2690 my $self = shift;
2691 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2692 $self->displayname(@_);
2695 sub set_displayname {
2696 my $self = shift;
2697 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2698 $self->displayname(@_);
2701 sub displayname {
2702 my ($self, $name, $disname) = @_;
2704 $self->throw("No sequence with name [$name]")
2705 unless defined $self->{'_seq'}->{$name};
2707 if( $disname and $name) {
2708 $self->{'_dis_name'}->{$name} = $disname;
2709 return $disname;
2711 elsif( defined $self->{'_dis_name'}->{$name} ) {
2712 return $self->{'_dis_name'}->{$name};
2713 } else {
2714 return $name;
2718 =head2 set_displayname_count
2720 Title : set_displayname_count
2721 Usage : $ali->set_displayname_count
2722 Function : Sets the names to be name_# where # is the number of
2723 times this name has been used.
2724 Returns : 1, on success
2725 Argument :
2727 =cut
2729 sub set_displayname_count {
2730 my $self= shift;
2731 my (@arr,$name,$seq,$count,$temp,$nse);
2733 foreach $seq ( $self->each_alphabetically() ) {
2734 $nse = $seq->get_nse();
2736 #name will be set when this is the second
2737 #time (or greater) is has been seen
2739 if( defined $name and $name eq ($seq->id()) ) {
2740 $temp = sprintf("%s_%s",$name,$count);
2741 $self->displayname($nse,$temp);
2742 $count++;
2743 } else {
2744 $count = 1;
2745 $name = $seq->id();
2746 $temp = sprintf("%s_%s",$name,$count);
2747 $self->displayname($nse,$temp);
2748 $count++;
2751 return 1;
2754 =head2 set_displayname_flat
2756 Title : set_displayname_flat
2757 Usage : $ali->set_displayname_flat()
2758 Function : Makes all the sequences be displayed as just their name,
2759 not name/start-end
2760 Returns : 1
2761 Argument :
2763 =cut
2765 sub set_displayname_flat {
2766 my $self = shift;
2767 my ($nse,$seq);
2769 foreach $seq ( $self->each_seq() ) {
2770 $nse = $seq->get_nse();
2771 $self->displayname($nse,$seq->id());
2773 return 1;
2776 =head2 set_displayname_normal
2778 Title : set_displayname_normal
2779 Usage : $ali->set_displayname_normal()
2780 Function : Makes all the sequences be displayed as name/start-end
2781 Returns : 1, on success
2782 Argument :
2784 =cut
2786 sub set_displayname_normal {
2787 my $self = shift;
2788 my ($nse,$seq);
2790 foreach $seq ( $self->each_seq() ) {
2791 $nse = $seq->get_nse();
2792 $self->displayname($nse,$nse);
2794 return 1;
2797 =head2 source
2799 Title : source
2800 Usage : $obj->source($newval)
2801 Function: sets the Alignment source program
2802 Example :
2803 Returns : value of source
2804 Args : newvalue (optional)
2807 =cut
2809 sub source{
2810 my ($self,$value) = @_;
2811 if( defined $value) {
2812 $self->{'_source'} = $value;
2814 return $self->{'_source'};
2817 =head2 set_displayname_safe
2819 Title : set_displayname_safe
2820 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2821 Function : Assign machine-generated serial names to sequences in input order.
2822 Designed to protect names during PHYLIP runs. Assign 10-char string
2823 in the form of "S000000001" to "S999999999". Restore the original
2824 names using "restore_displayname".
2825 Returns : 1. a new $aln with system names;
2826 2. a hash ref for restoring names
2827 Argument : Number for id length (default 10)
2829 =cut
2831 sub set_displayname_safe {
2832 my $self = shift;
2833 my $idlength = shift || 10;
2834 my ($seq, %phylip_name);
2835 my $ct=0;
2836 my $new=Bio::SimpleAlign->new();
2837 foreach $seq ( $self->each_seq() ) {
2838 $ct++;
2839 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2840 $phylip_name{$pname}=$seq->id();
2841 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2842 -seq => $seq->seq(),
2843 -alphabet => $seq->alphabet,
2844 -start => $seq->{_start},
2845 -end => $seq->{_end}
2847 $new->add_seq($new_seq);
2850 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2851 return ($new, \%phylip_name);
2854 =head2 restore_displayname
2856 Title : restore_displayname
2857 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2858 Function : Restore original sequence names (after running
2859 $ali->set_displayname_safe)
2860 Returns : a new $aln with names restored.
2861 Argument : a hash reference of names from "set_displayname_safe".
2863 =cut
2865 sub restore_displayname {
2866 my $self = shift;
2867 my $ref=shift;
2868 my %name=%$ref;
2869 my $new=Bio::SimpleAlign->new();
2870 foreach my $seq ( $self->each_seq() ) {
2871 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2872 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2873 -seq => $seq->seq(),
2874 -alphabet => $seq->alphabet,
2875 -start => $seq->{_start},
2876 -end => $seq->{_end}
2878 $new->add_seq($new_seq);
2880 return $new;
2883 =head2 sort_by_start
2885 Title : sort_by_start
2886 Usage : $ali->sort_by_start
2887 Function : Changes the order of the alignment to the start position of each
2888 subalignment
2889 Returns :
2890 Argument :
2892 =cut
2894 sub sort_by_start {
2895 my $self = shift;
2896 my ($seq,$nse,@arr,%hash,$count);
2897 foreach $seq ( $self->each_seq() ) {
2898 $nse = $seq->get_nse;
2899 $hash{$nse} = $seq;
2901 $count = 0;
2902 %{$self->{'_order'}} = (); # reset the hash;
2903 foreach $nse ( sort _startend keys %hash) {
2904 $self->{'_order'}->{$count} = $nse;
2905 $count++;
2910 sub _startend
2912 my ($aname,$arange) = split (/[\/]/,$a);
2913 my ($bname,$brange) = split (/[\/]/,$b);
2914 my ($astart,$aend) = split(/\-/,$arange);
2915 my ($bstart,$bend) = split(/\-/,$brange);
2916 return $astart <=> $bstart;
2919 =head2 bracket_string
2921 Title : bracket_string
2922 Usage : my @params = (-refseq => 'testseq',
2923 -allele1 => 'allele1',
2924 -allele2 => 'allele2',
2925 -delimiters => '{}',
2926 -separator => '/');
2927 $str = $aln->bracket_string(@params)
2929 Function : When supplied with a list of parameters (see below), returns a
2930 string in BIC format. This is used for allelic comparisons.
2931 Briefly, if either allele contains a base change when compared to
2932 the refseq, the base or gap for each allele is represented in
2933 brackets in the order present in the 'alleles' parameter.
2935 For the following data:
2937 >testseq
2938 GGATCCATTGCTACT
2939 >allele1
2940 GGATCCATTCCTACT
2941 >allele2
2942 GGAT--ATTCCTCCT
2944 the returned string with parameters 'refseq => testseq' and
2945 'alleles => [qw(allele1 allele2)]' would be:
2947 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2948 Returns : BIC-formatted string
2949 Argument : Required args
2950 refseq : string (ID) of the reference sequence used
2951 as basis for comparison
2952 allele1 : string (ID) of the first allele
2953 allele2 : string (ID) of the second allele
2954 Optional args
2955 delimiters: two symbol string of left and right delimiters.
2956 Only the first two symbols are used
2957 default = '[]'
2958 separator : string used as a separator. Only the first
2959 symbol is used
2960 default = '/'
2961 Throws : On no refseq/alleles, or invalid refseq/alleles.
2963 =cut
2965 sub bracket_string {
2966 my ($self, @args) = @_;
2967 my ($ref, $a1, $a2, $delim, $sep) =
2968 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2969 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2970 my ($ld, $rd);
2971 ($ld, $rd) = split('', $delim, 2) if $delim;
2972 $ld ||= '[';
2973 $rd ||= ']';
2974 $sep ||= '/';
2975 my ($refseq, $allele1, $allele2) =
2976 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2977 if (!$refseq || !$allele1 || !$allele2) {
2978 $self->throw("One of your refseq/allele IDs is invalid!");
2980 my $len = $self->length-1;
2981 my $bic = '';
2982 # loop over the alignment columns
2983 for my $column ( 0 .. $len ) {
2984 my $string;
2985 my ($compres, $res1, $res2) =
2986 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2987 # are any of the allele symbols different from the refseq?
2988 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2989 $ld.$res1.$sep.$res2.$rd;
2990 $bic .= $string;
2992 return $bic;
2996 =head2 methods implementing Bio::FeatureHolderI
2998 FeatureHolderI implementation to support labeled character sets like one
2999 would get from NEXUS represented data.
3001 =head2 get_SeqFeatures
3003 Usage : @features = $aln->get_SeqFeatures
3004 Function: Get the feature objects held by this feature holder.
3005 Example :
3006 Returns : an array of Bio::SeqFeatureI implementing objects
3007 Args : optional filter coderef, taking a Bio::SeqFeatureI
3008 : as argument, returning TRUE if wanted, FALSE if
3009 : unwanted
3011 =cut
3013 sub get_SeqFeatures {
3014 my $self = shift;
3015 my $filter_cb = shift;
3016 $self->throw("Arg (filter callback) must be a coderef") unless
3017 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
3018 if( !defined $self->{'_as_feat'} ) {
3019 $self->{'_as_feat'} = [];
3021 if ($filter_cb) {
3022 return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
3024 return @{$self->{'_as_feat'}};
3027 =head2 add_SeqFeature
3029 Usage : $aln->add_SeqFeature($subfeat);
3030 Function: adds a SeqFeature into the SeqFeature array.
3031 Example :
3032 Returns : true on success
3033 Args : a Bio::SeqFeatureI object
3034 Note : This implementation is not compliant
3035 with Bio::FeatureHolderI
3037 =cut
3039 sub add_SeqFeature {
3040 my ($self,@feat) = @_;
3042 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3044 foreach my $feat ( @feat ) {
3045 if( !$feat->isa("Bio::SeqFeatureI") ) {
3046 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3049 push(@{$self->{'_as_feat'}},$feat);
3051 return 1;
3055 =head2 remove_SeqFeatures
3057 Usage : $obj->remove_SeqFeatures
3058 Function: Removes all SeqFeatures. If you want to remove only a subset,
3059 remove that subset from the returned array, and add back the rest.
3060 Returns : The array of Bio::SeqFeatureI features that was
3061 deleted from this alignment.
3062 Args : none
3064 =cut
3066 sub remove_SeqFeatures {
3067 my $self = shift;
3069 return () unless $self->{'_as_feat'};
3070 my @feats = @{$self->{'_as_feat'}};
3071 $self->{'_as_feat'} = [];
3072 return @feats;
3075 =head2 feature_count
3077 Title : feature_count
3078 Usage : $obj->feature_count()
3079 Function: Return the number of SeqFeatures attached to the alignment
3080 Returns : integer representing the number of SeqFeatures
3081 Args : None
3083 =cut
3085 sub feature_count {
3086 my ($self) = @_;
3088 if (defined($self->{'_as_feat'})) {
3089 return ($#{$self->{'_as_feat'}} + 1);
3090 } else {
3091 return 0;
3095 =head2 get_all_SeqFeatures
3097 Title : get_all_SeqFeatures
3098 Usage :
3099 Function: Get all SeqFeatures.
3100 Example :
3101 Returns : an array of Bio::SeqFeatureI implementing objects
3102 Args : none
3103 Note : Falls through to Bio::FeatureHolderI implementation.
3105 =cut
3107 =head2 methods for Bio::AnnotatableI
3109 AnnotatableI implementation to support sequence alignments which
3110 contain annotation (NEXUS, Stockholm).
3112 =head2 annotation
3114 Title : annotation
3115 Usage : $ann = $aln->annotation or
3116 $aln->annotation($ann)
3117 Function: Gets or sets the annotation
3118 Returns : Bio::AnnotationCollectionI object
3119 Args : None or Bio::AnnotationCollectionI object
3121 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3122 for more information
3124 =cut
3126 sub annotation {
3127 my ($obj,$value) = @_;
3128 if( defined $value ) {
3129 $obj->throw("object of class ".ref($value)." does not implement ".
3130 "Bio::AnnotationCollectionI. Too bad.")
3131 unless $value->isa("Bio::AnnotationCollectionI");
3132 $obj->{'_annotation'} = $value;
3133 } elsif( ! defined $obj->{'_annotation'}) {
3134 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3136 return $obj->{'_annotation'};
3139 =head1 Deprecated methods
3141 =cut
3143 =head2 no_residues
3145 Title : no_residues
3146 Usage : $no = $ali->no_residues
3147 Function : number of residues in total in the alignment
3148 Returns : integer
3149 Argument :
3150 Note : deprecated in favor of num_residues()
3152 =cut
3154 sub no_residues {
3155 my $self = shift;
3156 $self->deprecated(-warn_version => 1.0069,
3157 -throw_version => 1.0075,
3158 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3159 $self->num_residues(@_);
3162 =head2 no_sequences
3164 Title : no_sequences
3165 Usage : $depth = $ali->no_sequences
3166 Function : number of sequence in the sequence alignment
3167 Returns : integer
3168 Argument :
3169 Note : deprecated in favor of num_sequences()
3171 =cut
3173 sub no_sequences {
3174 my $self = shift;
3175 $self->deprecated(-warn_version => 1.0069,
3176 -throw_version => 1.0075,
3177 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3178 $self->num_sequences(@_);
3181 =head2 mask_columns
3183 Title : mask_columns
3184 Usage : $aln2 = $aln->mask_columns(20,30)
3185 Function : Masks a slice of the alignment inclusive of start and
3186 end columns, and the first column in the alignment is denoted 1.
3187 Mask beyond the length of the sequence does not do padding.
3188 Returns : A Bio::SimpleAlign object
3189 Args : Positive integer for start column, positive integer for end column,
3190 optional string value use for the mask. Example:
3192 $aln2 = $aln->mask_columns(20,30,'?')
3193 Note : Masking must use a character that is not used for gaps or
3194 frameshifts. These can be adjusted using the relevant global
3195 variables, but be aware these may be (uncontrollably) modified
3196 elsewhere within BioPerl (see bug 2715)
3198 =cut
3200 sub mask_columns {
3201 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3202 my $self = shift;
3204 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3205 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3207 # coordinates are alignment-based, not sequence-based
3208 my ($start, $end, $mask_char) = @_;
3209 unless (defined $mask_char) { $mask_char = 'N' }
3211 $self->throw("Mask start has to be a positive integer and less than ".
3212 "alignment length, not [$start]")
3213 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3214 $self->throw("Mask end has to be a positive integer and less than ".
3215 "alignment length, not [$end]")
3216 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3217 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3218 "end [$end]") unless $start <= $end;
3219 $self->throw("Mask character $mask_char has to be a single character ".
3220 "and not a gap or frameshift symbol")
3221 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3223 my $aln = $self->new;
3224 $aln->id($self->id);
3225 foreach my $seq ( $self->each_seq() ) {
3226 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3227 -alphabet => $seq->alphabet,
3228 -strand => $seq->strand,
3229 -verbose => $self->verbose);
3231 # convert from 1-based alignment coords!
3232 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3233 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3234 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3235 $new_seq->seq($new_dna_string);
3236 $aln->add_seq($new_seq);
3238 return $aln;