don't match strings that wrap lines, use fuzzy match instead
[bioperl-live.git] / Bio / SimpleAlign.pm
blob72701db3c783565ffaedd663ff1f7ce82d4ca0ce
1 # BioPerl module for SimpleAlign
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 # History:
14 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
15 # May 2001 major rewrite - Heikki Lehvaslaiho
17 =head1 NAME
19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
21 =head1 SYNOPSIS
23 # Use Bio::AlignIO to read in the alignment
24 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
25 $aln = $str->next_aln();
27 # Describe
28 print $aln->length;
29 print $aln->num_residues;
30 print $aln->is_flush;
31 print $aln->num_sequences;
32 print $aln->score;
33 print $aln->percentage_identity;
34 print $aln->consensus_string(50);
36 # Find the position in the alignment for a sequence location
37 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
39 # Extract sequences and check values for the alignment column $pos
40 foreach $seq ($aln->each_seq) {
41 $res = $seq->subseq($pos, $pos);
42 $count{$res}++;
44 foreach $res (keys %count) {
45 printf "Res: %s Count: %2d\n", $res, $count{$res};
48 # Manipulate
49 $aln->remove_seq($seq);
50 $mini_aln = $aln->slice(20,30); # get a block of columns
51 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
52 $new_aln = $aln->remove_columns([20,30]); # remove by position
53 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
55 # Analyze
56 $str = $aln->consensus_string($threshold_percent);
57 $str = $aln->match_line();
58 $str = $aln->cigar_line();
59 $id = $aln->percentage_identity;
61 # See the module documentation for details and more methods.
63 =head1 DESCRIPTION
65 SimpleAlign is an object that handles a multiple sequence alignment
66 (MSA). It is very permissive of types (it does not insist on sequences
67 being all same length, for example). Think of it as a set of sequences
68 with a whole series of built-in manipulations and methods for reading and
69 writing alignments.
71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
72 to store its sequences. These are subsequences with a start and end
73 positions in the parent reference sequence. Each sequence in the
74 SimpleAlign object is a Bio::LocatableSeq.
76 SimpleAlign expects the combination of name, start, and end for a
77 given sequence to be unique in the alignment, and this is the key for the
78 internal hashes (name, start, end are abbreviated C<nse> in the code).
79 However, in some cases people do not want the name/start-end to be displayed:
80 either multiple names in an alignment or names specific to the alignment
81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
82 C<displayname>, and generally is what is used to print out the
83 alignment. They default to name/start-end.
85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to one
93 of the Bioperl mailing lists. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Support
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 the bugs and their resolution. Bug reports can be submitted via the
113 web:
115 https://redmine.open-bio.org/projects/bioperl/
117 =head1 AUTHOR
119 Ewan Birney, birney@ebi.ac.uk
121 =head1 CONTRIBUTORS
123 Allen Day, allenday-at-ucla.edu,
124 Richard Adams, Richard.Adams-at-ed.ac.uk,
125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
127 Allen Smith, allens-at-cpan.org,
128 Jason Stajich, jason-at-bioperl.org,
129 Anthony Underwood, aunderwood-at-phls.org.uk,
130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
131 Brian Osborne, bosborne at alum.mit.edu
132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
133 Hongyu Zhang, forward at hongyu.org
134 Jay Hannah, jay at jays.net
135 Alexandr Bezginov, albezg at gmail.com
137 =head1 SEE ALSO
139 L<Bio::LocatableSeq>
141 =head1 APPENDIX
143 The rest of the documentation details each of the object
144 methods. Internal methods are usually preceded with a _
146 =cut
148 # 'Let the code begin...
150 package Bio::SimpleAlign;
151 use vars qw(%CONSERVATION_GROUPS);
152 use strict;
154 use Bio::LocatableSeq; # uses Seq's as list
156 use Bio::Seq;
157 use Bio::SeqFeature::Generic;
159 BEGIN {
160 # This data should probably be in a more centralized module...
161 # it is taken from Clustalw documentation.
162 # These are all the positively scoring groups that occur in the
163 # Gonnet Pam250 matrix. The strong and weak groups are
164 # defined as strong score >0.5 and weak score =<0.5 respectively.
166 %CONSERVATION_GROUPS = (
167 'strong' => [ qw(
169 NEQK
170 NHQK
171 NDEQ
172 QHRK
173 MILV
174 MILF
176 FYW )],
177 'weak' => [ qw(
181 STNK
182 STPA
183 SGND
184 SNDEQK
185 NDEQHK
186 NEQHRK
187 FVLIM
188 HFY )],);
191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
192 Bio::FeatureHolderI);
194 =head2 new
196 Title : new
197 Usage : my $aln = Bio::SimpleAlign->new();
198 Function : Creates a new simple align object
199 Returns : Bio::SimpleAlign
200 Args : -source => string representing the source program
201 where this alignment came from
202 -annotation => Bio::AnnotationCollectionI
203 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
204 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
205 -consensus => consensus string
206 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
208 =cut
211 sub new {
212 my($class,@args) = @_;
214 my $self = $class->SUPER::new(@args);
216 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
217 SOURCE
218 SCORE
220 ACCESSION
221 DESCRIPTION
222 SEQS
223 FEATURES
224 ANNOTATION
225 SEQ_ANNOTATION
226 CONSENSUS
227 CONSENSUS_META
228 )], @args);
229 $src && $self->source($src);
230 defined $score && $self->score($score);
231 # we need to set up internal hashs first!
233 $self->{'_seq'} = {};
234 $self->{'_order'} = {};
235 $self->{'_start_end_lists'} = {};
236 $self->{'_dis_name'} = {};
237 $self->{'_id'} = 'NoName';
238 # maybe we should automatically read in from args. Hmmm...
239 $id && $self->id($id);
240 $acc && $self->accession($acc);
241 $desc && $self->description($desc);
242 $coll && $self->annotation($coll);
243 # sequence annotation is layered into a provided annotation collection (or dies)
244 if ($sa) {
245 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
246 "with a sequence annotation collection")
247 if !$coll;
248 $coll->add_Annotation('seq_annotation', $sa);
250 if ($feats && ref $feats eq 'ARRAY') {
251 for my $feat (@$feats) {
252 $self->add_SeqFeature($feat);
255 $con && $self->consensus($con);
256 $cmeta && $self->consensus_meta($cmeta);
257 # assumes these are in correct alignment order
258 if ($seqs && ref($seqs) eq 'ARRAY') {
259 for my $seq (@$seqs) {
260 $self->add_seq($seq);
264 return $self; # success - we hope!
267 =head1 Modifier methods
269 These methods modify the MSA by adding, removing or shuffling complete
270 sequences.
272 =head2 add_seq
274 Title : add_seq
275 Usage : $myalign->add_seq($newseq);
276 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
277 Function : Adds another sequence to the alignment. *Does not* align
278 it - just adds it to the hashes.
279 If -ORDER is specified, the sequence is inserted at the
280 the position spec'd by -ORDER, and existing sequences
281 are pushed down the storage array.
282 Returns : nothing
283 Args : A Bio::LocatableSeq object
284 Positive integer for the sequence position (optional)
286 See L<Bio::LocatableSeq> for more information
288 =cut
290 sub addSeq {
291 my $self = shift;
292 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
293 $self->add_seq(@_);
296 sub add_seq {
297 my $self = shift;
298 my @args = @_;
299 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
300 my ($name,$id,$start,$end);
302 unless ($seq) {
303 $self->throw("LocatableSeq argument required");
305 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
306 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
308 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
309 $order--; # jay's patch (user-specified order is 1-origin)
311 if ($order < 0) {
312 $self->throw("User-specified value for ORDER must be >= 1");
315 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
317 # build the symbol list for this sequence,
318 # will prune out the gap and missing/match chars
319 # when actually asked for the symbol list in the
320 # symbol_chars
321 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
323 $name = $seq->get_nse;
325 if( $self->{'_seq'}->{$name} ) {
326 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
328 else {
329 $self->debug( "Assigning $name to $order\n");
331 my $ordh = $self->{'_order'};
332 if ($ordh->{$order}) {
333 # make space to insert
334 # $c->() returns (in reverse order) the first subsequence
335 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
336 # (3,2,1), and $c->(2,4,5) returns (2).
337 my $c;
338 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
339 map {
340 $ordh->{$_+1} = $ordh->{$_}
341 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
344 $ordh->{$order} = $name;
346 unless( exists( $self->{'_start_end_lists'}->{$id})) {
347 $self->{'_start_end_lists'}->{$id} = [];
349 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
352 $self->{'_seq'}->{$name} = $seq;
357 =head2 remove_seq
359 Title : remove_seq
360 Usage : $aln->remove_seq($seq);
361 Function : Removes a single sequence from an alignment
362 Returns :
363 Argument : a Bio::LocatableSeq object
365 =cut
367 sub removeSeq {
368 my $self = shift;
369 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
370 $self->remove_seq(@_);
373 sub remove_seq {
374 my $self = shift;
375 my $seq = shift;
376 my ($name,$id);
378 $self->throw("Need Bio::Locatable seq argument ")
379 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
381 $id = $seq->id();
382 $name = $seq->get_nse;
384 if( !exists $self->{'_seq'}->{$name} ) {
385 $self->throw("Sequence $name does not exist in the alignment to remove!");
388 delete $self->{'_seq'}->{$name};
390 # we need to remove this seq from the start_end_lists hash
392 if (exists $self->{'_start_end_lists'}->{$id}) {
393 # we need to find the sequence in the array.
395 my ($i, $found);;
396 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
397 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
398 $found = 1;
399 last;
402 if ($found) {
403 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
405 else {
406 $self->throw("Could not find the sequence to remoce from the start-end list");
409 else {
410 $self->throw("There is no seq list for the name $id");
412 # we need to shift order hash
413 my %rev_order = reverse %{$self->{'_order'}};
414 my $no = $rev_order{$name};
415 my $num_sequences = $self->num_sequences;
416 for (; $no < $num_sequences; $no++) {
417 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
419 delete $self->{'_order'}->{$no};
420 return 1;
424 =head2 purge
426 Title : purge
427 Usage : $aln->purge(0.7);
428 Function: Removes sequences above given sequence similarity
429 This function will grind on large alignments. Beware!
430 Example :
431 Returns : An array of the removed sequences
432 Args : float, threshold for similarity
434 =cut
436 sub purge {
437 my ($self,$perc) = @_;
438 my (%duplicate, @dups);
440 my @seqs = $self->each_seq();
442 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
443 my $seq = $seqs[$i];
445 #skip if already in duplicate hash
446 next if exists $duplicate{$seq->display_id} ;
447 my $one = $seq->seq();
449 my @one = split '', $one; #split to get 1aa per array element
451 for (my $j=$i+1;$j < @seqs;$j++) {
452 my $seq2 = $seqs[$j];
454 #skip if already in duplicate hash
455 next if exists $duplicate{$seq2->display_id} ;
457 my $two = $seq2->seq();
458 my @two = split '', $two;
460 my $count = 0;
461 my $res = 0;
462 for (my $k=0;$k<@one;$k++) {
463 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
464 $one[$k] eq $two[$k]) {
465 $count++;
467 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
468 $two[$k] ne '.' && $two[$k] ne '-' ) {
469 $res++;
473 my $ratio = 0;
474 $ratio = $count/$res unless $res == 0;
476 # if above threshold put in duplicate hash and push onto
477 # duplicate array for returning to get_unique
478 if ( $ratio > $perc ) {
479 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
480 $duplicate{$seq2->display_id} = 1;
481 push @dups, $seq2;
485 foreach my $seq (@dups) {
486 $self->remove_seq($seq);
488 return @dups;
491 =head2 sort_alphabetically
493 Title : sort_alphabetically
494 Usage : $ali->sort_alphabetically
495 Function : Changes the order of the alignment to alphabetical on name
496 followed by numerical by number.
497 Returns :
498 Argument :
500 =cut
502 sub sort_alphabetically {
503 my $self = shift;
504 my ($seq,$nse,@arr,%hash,$count);
506 foreach $seq ( $self->each_seq() ) {
507 $nse = $seq->get_nse;
508 $hash{$nse} = $seq;
511 $count = 0;
513 %{$self->{'_order'}} = (); # reset the hash;
515 foreach $nse ( sort _alpha_startend keys %hash) {
516 $self->{'_order'}->{$count} = $nse;
518 $count++;
523 =head2 sort_by_list
525 Title : sort_by_list
526 Usage : $aln_ordered=$aln->sort_by_list($list_file)
527 Function : Arbitrarily order sequences in an alignment
528 Returns : A new Bio::SimpleAlign object
529 Argument : a file listing sequence names in intended order (one name per line)
531 =cut
533 sub sort_by_list {
534 my ($self, $list) = @_;
535 my (@seq, @ids, %order);
537 foreach my $seq ( $self->each_seq() ) {
538 push @seq, $seq;
539 push @ids, $seq->display_id;
542 my $ct=1;
543 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
544 while (<$listfh>) {
545 chomp;
546 my $name=$_;
547 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
548 $order{$name}=$ct++;
550 close($listfh);
552 # use the map-sort-map idiom:
553 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
554 my $aln = $self->new;
555 foreach (@sorted) { $aln->add_seq($_) }
556 return $aln;
559 =head2 set_new_reference
561 Title : set_new_reference
562 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
563 the sequence whoes name is "B31" (full, exact, and case-sensitive),
564 as the reference (1st) sequence
565 Function : Change/Set a new reference (i.e., the first) sequence
566 Returns : a new Bio::SimpleAlign object.
567 Throws an exception if designated sequence not found
568 Argument : a positive integer of sequence order, or a sequence name
569 in the original alignment
571 =cut
573 sub set_new_reference {
574 my ($self, $seqid) = @_;
575 my $aln = $self->new;
576 my (@seq, @ids, @new_seq);
577 my $is_num=0;
578 foreach my $seq ( $self->each_seq() ) {
579 push @seq, $seq;
580 push @ids, $seq->display_id;
583 if ($seqid =~ /^\d+$/) { # argument is seq position
584 $is_num=1;
585 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
586 } else { # argument is a seq name
587 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
590 for (my $i=0; $i<=$#seq; $i++) {
591 my $pos=$i+1;
592 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
593 unshift @new_seq, $seq[$i];
594 } else {
595 push @new_seq, $seq[$i];
598 foreach (@new_seq) { $aln->add_seq($_); }
599 return $aln;
602 sub _in_aln { # check if input name exists in the alignment
603 my ($str, $ref) = @_;
604 foreach (@$ref) {
605 return 1 if $str eq $_;
607 return 0;
611 =head2 uniq_seq
613 Title : uniq_seq
614 Usage : $aln->uniq_seq(): Remove identical sequences in
615 in the alignment. Ambiguous base ("N", "n") and
616 leading and ending gaps ("-") are NOT counted as
617 differences.
618 Function : Make a new alignment of unique sequence types (STs)
619 Returns : 1a. if called in a scalar context,
620 a new Bio::SimpleAlign object (all sequences renamed as "ST")
621 1b. if called in an array context,
622 a new Bio::SimpleAlign object, and a hashref whose keys
623 are sequence types, and whose values are arrayrefs to
624 lists of sequence ids within the corresponding sequence type
625 2. if $aln->verbose > 0, ST of each sequence is sent to
626 STDERR (in a tabular format)
627 Argument : None
629 =cut
631 sub uniq_seq {
632 my ($self, $seqid) = @_;
633 my $aln = $self->new;
634 my (%member, %order, @seq, @uniq_str, $st);
635 my $order=0;
636 my $len = $self->length();
637 $st = {};
638 foreach my $seq ( $self->each_seq() ) {
639 my $str = $seq->seq();
641 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
642 # comparing two sequence strings
644 # 1st, convert "n", "N" to "?" (for DNA sequence only):
645 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
646 # 2nd, convert leading and ending gaps to "?":
647 $str = &_convert_leading_ending_gaps($str, '-', '?');
648 # Note that '?' also can mean unknown residue.
649 # I don't like making global class member changes like this, too
650 # prone to errors... -- cjfields 08-11-18
651 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
652 my $new = Bio::LocatableSeq->new(
653 -id => $seq->id(),
654 -alphabet=> $seq->alphabet,
655 -seq => $str,
656 -start => $seq->start,
657 -end => $seq->end
659 push @seq, $new;
662 foreach my $seq (@seq) {
663 my $str = $seq->seq();
664 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
665 if ($seen) { # seen before
666 my @memb = @{$member{$key}};
667 push @memb, $seq;
668 $member{$key} = \@memb;
669 } else { # not seen
670 push @uniq_str, $key;
671 $order++;
672 $member{$key} = [ ($seq) ];
673 $order{$key} = $order;
677 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
678 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
679 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
680 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
681 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
682 my $gap='-';
683 my $end= CORE::length($str2);
684 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
685 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
686 -seq =>$str2,
687 -start=>1,
688 -end =>$end
690 $aln->add_seq($new);
691 foreach (@{$member{$str}}) {
692 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
693 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
696 return wantarray ? ($aln, $st) : $aln;
699 sub _check_uniq { # check if same seq exists in the alignment
700 my ($str1, $ref, $length) = @_;
701 my @char1=split //, $str1;
702 my @array=@$ref;
704 return (0, $str1) if @array==0; # not seen (1st sequence)
706 foreach my $str2 (@array) {
707 my $diff=0;
708 my @char2=split //, $str2;
709 for (my $i=0; $i<=$length-1; $i++) {
710 next if $char1[$i] eq '?';
711 next if $char2[$i] eq '?';
712 $diff++ if $char1[$i] ne $char2[$i];
714 return (1, $str2) if $diff == 0; # seen before
717 return (0, $str1); # not seen
720 sub _convert_leading_ending_gaps {
721 my $s=shift;
722 my $sym1=shift;
723 my $sym2=shift;
724 my @array=split //, $s;
725 # convert leading char:
726 for (my $i=0; $i<=$#array; $i++) {
727 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
729 # convert ending char:
730 for (my $i = $#array; $i>= 0; $i--) {
731 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
733 my $s_new=join '', @array;
734 return $s_new;
737 =head1 Sequence selection methods
739 Methods returning one or more sequences objects.
741 =head2 each_seq
743 Title : each_seq
744 Usage : foreach $seq ( $align->each_seq() )
745 Function : Gets a Seq object from the alignment
746 Returns : Seq object
747 Argument :
749 =cut
751 sub eachSeq {
752 my $self = shift;
753 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
754 $self->each_seq();
757 sub each_seq {
758 my $self = shift;
759 my (@arr,$order);
761 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
762 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
763 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
766 return @arr;
770 =head2 each_alphabetically
772 Title : each_alphabetically
773 Usage : foreach $seq ( $ali->each_alphabetically() )
774 Function : Returns a sequence object, but the objects are returned
775 in alphabetically sorted order.
776 Does not change the order of the alignment.
777 Returns : Seq object
778 Argument :
780 =cut
782 sub each_alphabetically {
783 my $self = shift;
784 my ($seq,$nse,@arr,%hash,$count);
786 foreach $seq ( $self->each_seq() ) {
787 $nse = $seq->get_nse;
788 $hash{$nse} = $seq;
791 foreach $nse ( sort _alpha_startend keys %hash) {
792 push(@arr,$hash{$nse});
794 return @arr;
797 sub _alpha_startend {
798 my ($aname,$astart,$bname,$bstart);
799 ($aname,$astart) = split (/-/,$a);
800 ($bname,$bstart) = split (/-/,$b);
802 if( $aname eq $bname ) {
803 return $astart <=> $bstart;
805 else {
806 return $aname cmp $bname;
810 =head2 each_seq_with_id
812 Title : each_seq_with_id
813 Usage : foreach $seq ( $align->each_seq_with_id() )
814 Function : Gets a Seq objects from the alignment, the contents
815 being those sequences with the given name (there may be
816 more than one)
817 Returns : Seq object
818 Argument : a seq name
820 =cut
822 sub eachSeqWithId {
823 my $self = shift;
824 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
825 $self->each_seq_with_id(@_);
828 sub each_seq_with_id {
829 my $self = shift;
830 my $id = shift;
832 $self->throw("Method each_seq_with_id needs a sequence name argument")
833 unless defined $id;
835 my (@arr, $seq);
837 if (exists($self->{'_start_end_lists'}->{$id})) {
838 @arr = @{$self->{'_start_end_lists'}->{$id}};
840 return @arr;
843 =head2 get_seq_by_pos
845 Title : get_seq_by_pos
846 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
847 Function : Gets a sequence based on its position in the alignment.
848 Numbering starts from 1. Sequence positions larger than
849 num_sequences() will thow an error.
850 Returns : a Bio::LocatableSeq object
851 Args : positive integer for the sequence position
853 =cut
855 sub get_seq_by_pos {
857 my $self = shift;
858 my ($pos) = @_;
860 $self->throw("Sequence position has to be a positive integer, not [$pos]")
861 unless $pos =~ /^\d+$/ and $pos > 0;
862 $self->throw("No sequence at position [$pos]")
863 unless $pos <= $self->num_sequences ;
865 my $nse = $self->{'_order'}->{--$pos};
866 return $self->{'_seq'}->{$nse};
869 =head2 get_seq_by_id
871 Title : get_seq_by_id
872 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
873 Function : Gets a sequence based on its name.
874 Sequences that do not exist will warn and return undef
875 Returns : a Bio::LocatableSeq object
876 Args : string for sequence name
878 =cut
880 sub get_seq_by_id {
881 my ($self,$name) = @_;
882 unless( defined $name ) {
883 $self->warn("Must provide a sequence name");
884 return;
886 for my $seq ( values %{$self->{'_seq'}} ) {
887 if ( $seq->id eq $name) {
888 return $seq;
891 return;
894 =head2 seq_with_features
896 Title : seq_with_features
897 Usage : $seq = $aln->seq_with_features(-pos => 1,
898 -consensus => 60
899 -mask =>
900 sub { my $consensus = shift;
902 for my $i (1..5){
903 my $n = 'N' x $i;
904 my $q = '\?' x $i;
905 while($consensus =~ /[^?]$q[^?]/){
906 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
909 return $consensus;
912 Function: produces a Bio::Seq object by first splicing gaps from -pos
913 (by means of a splice_by_seq_pos() call), then creating
914 features using non-? chars (by means of a consensus_string()
915 call with stringency -consensus).
916 Returns : a Bio::Seq object
917 Args : -pos : required. sequence from which to build the Bio::Seq
918 object
919 -consensus : optional, defaults to consensus_string()'s
920 default cutoff value
921 -mask : optional, a coderef to apply to consensus_string()'s
922 output before building features. this may be useful for
923 closing gaps of 1 bp by masking over them with N, for
924 instance
926 =cut
928 sub seq_with_features{
929 my ($self,%arg) = @_;
931 #first do the preparatory splice
932 $self->throw("must provide a -pos argument") unless $arg{-pos};
933 $self->splice_by_seq_pos($arg{-pos});
935 my $consensus_string = $self->consensus_string($arg{-consensus});
936 $consensus_string = $arg{-mask}->($consensus_string)
937 if defined($arg{-mask});
939 my(@bs,@es);
941 push @bs, 1 if $consensus_string =~ /^[^?]/;
943 while($consensus_string =~ /\?[^?]/g){
944 push @bs, pos($consensus_string);
946 while($consensus_string =~ /[^?]\?/g){
947 push @es, pos($consensus_string);
950 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
952 my $seq = Bio::Seq->new();
954 # my $rootfeature = Bio::SeqFeature::Generic->new(
955 # -source_tag => 'location',
956 # -start => $self->get_seq_by_pos($arg{-pos})->start,
957 # -end => $self->get_seq_by_pos($arg{-pos})->end,
958 # );
959 # $seq->add_SeqFeature($rootfeature);
961 while(my $b = shift @bs){
962 my $e = shift @es;
963 $seq->add_SeqFeature(
964 Bio::SeqFeature::Generic->new(
965 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
966 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
967 -source_tag => $self->source || 'MSA',
972 return $seq;
976 =head1 Create new alignments
978 The result of these methods are horizontal or vertical subsets of the
979 current MSA.
981 =head2 select
983 Title : select
984 Usage : $aln2 = $aln->select(1, 3) # three first sequences
985 Function : Creates a new alignment from a continuous subset of
986 sequences. Numbering starts from 1. Sequence positions
987 larger than num_sequences() will thow an error.
988 Returns : a Bio::SimpleAlign object
989 Args : positive integer for the first sequence
990 positive integer for the last sequence to include (optional)
992 =cut
994 sub select {
995 my $self = shift;
996 my ($start, $end) = @_;
998 $self->throw("Select start has to be a positive integer, not [$start]")
999 unless $start =~ /^\d+$/ and $start > 0;
1000 $self->throw("Select end has to be a positive integer, not [$end]")
1001 unless $end =~ /^\d+$/ and $end > 0;
1002 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1003 unless $start <= $end;
1005 my $aln = $self->new;
1006 foreach my $pos ($start .. $end) {
1007 $aln->add_seq($self->get_seq_by_pos($pos));
1009 $aln->id($self->id);
1010 # fix for meta, sf, ann
1011 return $aln;
1014 =head2 select_noncont
1016 Title : select_noncont
1017 Usage : # 1st and 3rd sequences, sorted
1018 $aln2 = $aln->select_noncont(1, 3)
1020 # 1st and 3rd sequences, sorted (same as first)
1021 $aln2 = $aln->select_noncont(3, 1)
1023 # 1st and 3rd sequences, unsorted
1024 $aln2 = $aln->select_noncont('nosort',3, 1)
1026 Function : Creates a new alignment from a subset of sequences. Numbering
1027 starts from 1. Sequence positions larger than num_sequences() will
1028 throw an error. Sorts the order added to new alignment by default,
1029 to prevent sorting pass 'nosort' as the first argument in the list.
1030 Returns : a Bio::SimpleAlign object
1031 Args : array of integers for the sequences. If the string 'nosort' is
1032 passed as the first argument, the sequences will not be sorted
1033 in the new alignment but will appear in the order listed.
1035 =cut
1037 sub select_noncont {
1038 my $self = shift;
1039 my $nosort = 0;
1040 my (@pos) = @_;
1041 if ($pos[0] !~ m{^\d+$}) {
1042 my $sortcmd = shift @pos;
1043 if ($sortcmd eq 'nosort') {
1044 $nosort = 1;
1045 } else {
1046 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1050 my $end = $self->num_sequences;
1051 foreach ( @pos ) {
1052 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1053 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1056 @pos = sort {$a <=> $b} @pos unless $nosort;
1058 my $aln = $self->new;
1059 foreach my $p (@pos) {
1060 $aln->add_seq($self->get_seq_by_pos($p));
1062 $aln->id($self->id);
1063 # fix for meta, sf, ann
1064 return $aln;
1067 =head2 select_noncont_by_name
1069 Title : select_noncont_by_name
1070 Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1071 Function : Creates a new alignment from a subset of sequences which are
1072 selected by name (sequence ID).
1073 Returns : a Bio::SimpleAlign object
1074 Args : array of names (i.e., identifiers) for the sequences.
1076 =cut
1078 sub select_noncont_by_name {
1079 my ($self, @names) = @_;
1081 my $aln = $self->new;
1082 foreach my $name (@names) {
1083 $aln->add_seq($self->get_seq_by_id($name));
1085 $aln->id($self->id);
1087 return $aln;
1090 =head2 slice
1092 Title : slice
1093 Usage : $aln2 = $aln->slice(20,30)
1094 Function : Creates a slice from the alignment inclusive of start and
1095 end columns, and the first column in the alignment is denoted 1.
1096 Sequences with no residues in the slice are excluded from the
1097 new alignment and a warning is printed. Slice beyond the length of
1098 the sequence does not do padding.
1099 Returns : A Bio::SimpleAlign object
1100 Args : Positive integer for start column, positive integer for end column,
1101 optional boolean which if true will keep gap-only columns in the newly
1102 created slice. Example:
1104 $aln2 = $aln->slice(20,30,1)
1106 =cut
1108 sub slice {
1109 my $self = shift;
1110 my ($start, $end, $keep_gap_only) = @_;
1112 $self->throw("Slice start has to be a positive integer, not [$start]")
1113 unless $start =~ /^\d+$/ and $start > 0;
1114 $self->throw("Slice end has to be a positive integer, not [$end]")
1115 unless $end =~ /^\d+$/ and $end > 0;
1116 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1117 unless $start <= $end;
1118 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1119 "[$start] is too big.") if $start > $self->length;
1120 my $cons_meta = $self->consensus_meta;
1121 my $aln = $self->new;
1122 $aln->id($self->id);
1123 foreach my $seq ( $self->each_seq() ) {
1124 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1125 Bio::Seq::Meta->new
1126 (-id => $seq->id,
1127 -alphabet => $seq->alphabet,
1128 -strand => $seq->strand,
1129 -verbose => $self->verbose) :
1130 Bio::LocatableSeq->new
1131 (-id => $seq->id,
1132 -alphabet => $seq->alphabet,
1133 -strand => $seq->strand,
1134 -verbose => $self->verbose);
1136 # seq
1137 my $seq_end = $end;
1138 $seq_end = $seq->length if( $end > $seq->length );
1140 my $slice_seq = $seq->subseq($start, $seq_end);
1141 $new_seq->seq( $slice_seq );
1143 $slice_seq =~ s/\W//g;
1145 if ($start > 1) {
1146 my $pre_start_seq = $seq->subseq(1, $start - 1);
1147 $pre_start_seq =~ s/\W//g;
1148 if (!defined($seq->strand)) {
1149 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1150 } elsif ($seq->strand < 0){
1151 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1152 } else {
1153 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1155 } else {
1156 if ((defined $seq->strand)&&($seq->strand < 0)){
1157 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1158 } else {
1159 $new_seq->start( $seq->start);
1162 if ($new_seq->isa('Bio::Seq::MetaI')) {
1163 for my $meta_name ($seq->meta_names) {
1164 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1167 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1169 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1170 $aln->add_seq($new_seq);
1171 } else {
1172 if( $keep_gap_only ) {
1173 $aln->add_seq($new_seq);
1174 } else {
1175 my $nse = $seq->get_nse();
1176 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1177 " Sequence excluded from the new alignment.");
1181 if ($cons_meta) {
1182 my $new = Bio::Seq::Meta->new();
1183 for my $meta_name ($cons_meta->meta_names) {
1184 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1186 $aln->consensus_meta($new);
1188 $aln->annotation($self->annotation);
1189 # fix for meta, sf, ann
1190 return $aln;
1193 =head2 remove_columns
1195 Title : remove_columns
1196 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1197 $aln2 = $aln->remove_columns([0,0],[6,8])
1198 Function : Creates an aligment with columns removed corresponding to
1199 the specified type or by specifying the columns by number.
1200 Returns : Bio::SimpleAlign object
1201 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1202 'all_gaps_columns') or array ref where the referenced array
1203 contains a pair of integers that specify a range.
1204 The first column is 0
1206 =cut
1208 sub remove_columns {
1209 my ($self,@args) = @_;
1210 @args || $self->throw("Must supply column ranges or column types");
1211 my $aln;
1213 if ($args[0][0] =~ /^[a-z_]+$/i) {
1214 $aln = $self->_remove_columns_by_type($args[0]);
1215 } elsif ($args[0][0] =~ /^\d+$/) {
1216 $aln = $self->_remove_columns_by_num(\@args);
1217 } else {
1218 $self->throw("You must pass array references to remove_columns(), not @args");
1220 # fix for meta, sf, ann
1221 $aln;
1225 =head2 remove_gaps
1227 Title : remove_gaps
1228 Usage : $aln2 = $aln->remove_gaps
1229 Function : Creates an aligment with gaps removed
1230 Returns : a Bio::SimpleAlign object
1231 Args : a gap character(optional) if none specified taken
1232 from $self->gap_char,
1233 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1234 indicates that only all-gaps columns should be deleted
1236 Used from method L<remove_columns> in most cases. Set gap character
1237 using L<gap_char()|gap_char>.
1239 =cut
1241 sub remove_gaps {
1242 my ($self,$gapchar,$all_gaps_columns) = @_;
1243 my $gap_line;
1244 if ($all_gaps_columns) {
1245 $gap_line = $self->all_gap_line($gapchar);
1246 } else {
1247 $gap_line = $self->gap_line($gapchar);
1249 my $aln = $self->new;
1251 my @remove;
1252 my $length = 0;
1253 my $del_char = $gapchar || $self->gap_char;
1254 # Do the matching to get the segments to remove
1255 while ($gap_line =~ m/[$del_char]/g) {
1256 my $start = pos($gap_line)-1;
1257 $gap_line=~/\G[$del_char]+/gc;
1258 my $end = pos($gap_line)-1;
1260 #have to offset the start and end for subsequent removes
1261 $start-=$length;
1262 $end -=$length;
1263 $length += ($end-$start+1);
1264 push @remove, [$start,$end];
1267 #remove the segments
1268 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1269 # fix for meta, sf, ann
1270 return $aln;
1274 sub _remove_col {
1275 my ($self,$aln,$remove) = @_;
1276 my @new;
1278 my $gap = $self->gap_char;
1280 # splice out the segments and create new seq
1281 foreach my $seq($self->each_seq){
1282 my $new_seq = Bio::LocatableSeq->new(
1283 -id => $seq->id,
1284 -alphabet=> $seq->alphabet,
1285 -strand => $seq->strand,
1286 -verbose => $self->verbose);
1287 my $sequence = $seq->seq;
1288 foreach my $pair(@{$remove}){
1289 my $start = $pair->[0];
1290 my $end = $pair->[1];
1291 $sequence = $seq->seq unless $sequence;
1292 my $orig = $sequence;
1293 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1294 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1295 $sequence = $head.$tail;
1296 # start
1297 unless (defined $new_seq->start) {
1298 if ($start == 0) {
1299 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1300 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1302 else {
1303 my $start_adjust = $orig =~ /^$gap+/;
1304 if ($start_adjust) {
1305 $start_adjust = $+[0] == $start;
1307 $new_seq->start($seq->start + $start_adjust);
1310 # end
1311 if (($end + 1) >= CORE::length($orig)) {
1312 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1313 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1315 else {
1316 $new_seq->end($seq->end);
1320 if ($new_seq->end < $new_seq->start) {
1321 # we removed all columns except for gaps: set to 0 to indicate no
1322 # sequence
1323 $new_seq->start(0);
1324 $new_seq->end(0);
1327 $new_seq->seq($sequence) if $sequence;
1328 push @new, $new_seq;
1330 # add the new seqs to the alignment
1331 foreach my $new(@new){
1332 $aln->add_seq($new);
1334 # fix for meta, sf, ann
1335 return $aln;
1338 sub _remove_columns_by_type {
1339 my ($self,$type) = @_;
1340 my $aln = $self->new;
1341 my @remove;
1343 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1344 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1345 my %matchchars = ( 'match' => '\*',
1346 'weak' => '\.',
1347 'strong' => ':',
1348 'mismatch' => ' ',
1349 'gaps' => '',
1350 'all_gaps_columns' => ''
1352 # get the characters to delete against
1353 my $del_char;
1354 foreach my $type (@{$type}){
1355 $del_char.= $matchchars{$type};
1358 my $length = 0;
1359 my $match_line = $self->match_line;
1360 # do the matching to get the segments to remove
1361 if($del_char){
1362 while($match_line =~ m/[$del_char]/g ){
1363 my $start = pos($match_line)-1;
1364 $match_line=~/\G[$del_char]+/gc;
1365 my $end = pos($match_line)-1;
1367 #have to offset the start and end for subsequent removes
1368 $start-=$length;
1369 $end -=$length;
1370 $length += ($end-$start+1);
1371 push @remove, [$start,$end];
1375 # remove the segments
1376 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1377 $aln = $aln->remove_gaps() if $gap;
1378 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1379 # fix for meta, sf, ann
1380 $aln;
1384 sub _remove_columns_by_num {
1385 my ($self,$positions) = @_;
1386 my $aln = $self->new;
1388 # sort the positions
1389 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1391 my @remove;
1392 my $length = 0;
1393 foreach my $pos (@{$positions}) {
1394 my ($start, $end) = @{$pos};
1396 #have to offset the start and end for subsequent removes
1397 $start-=$length;
1398 $end -=$length;
1399 $length += ($end-$start+1);
1400 push @remove, [$start,$end];
1403 #remove the segments
1404 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1405 # fix for meta, sf, ann
1406 $aln;
1410 =head1 Change sequences within the MSA
1412 These methods affect characters in all sequences without changing the
1413 alignment.
1415 =head2 splice_by_seq_pos
1417 Title : splice_by_seq_pos
1418 Usage : $status = splice_by_seq_pos(1);
1419 Function: splices all aligned sequences where the specified sequence
1420 has gaps.
1421 Example :
1422 Returns : 1 on success
1423 Args : position of sequence to splice by
1426 =cut
1428 sub splice_by_seq_pos{
1429 my ($self,$pos) = @_;
1431 my $guide = $self->get_seq_by_pos($pos);
1432 my $guide_seq = $guide->seq;
1434 $guide_seq =~ s/\./\-/g;
1436 my @gaps = ();
1437 $pos = -1;
1438 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1439 unshift @gaps, $pos;
1440 $pos++;
1443 foreach my $seq ($self->each_seq){
1444 my @bases = split '', $seq->seq;
1446 splice(@bases, $_, 1) foreach @gaps;
1447 $seq->seq(join('', @bases));
1453 =head2 map_chars
1455 Title : map_chars
1456 Usage : $ali->map_chars('\.','-')
1457 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1458 characters
1460 Notice that the from (arg1) is interpretted as a regex,
1461 so be careful about quoting meta characters (eg
1462 $ali->map_chars('.','-') wont do what you want)
1463 Returns :
1464 Argument : 'from' rexexp
1465 'to' string
1467 =cut
1469 sub map_chars {
1470 my $self = shift;
1471 my $from = shift;
1472 my $to = shift;
1473 my ($seq,$temp);
1475 $self->throw("Need exactly two arguments")
1476 unless defined $from and defined $to;
1478 foreach $seq ( $self->each_seq() ) {
1479 $temp = $seq->seq();
1480 $temp =~ s/$from/$to/g;
1481 $seq->seq($temp);
1483 return 1;
1487 =head2 uppercase
1489 Title : uppercase()
1490 Usage : $ali->uppercase()
1491 Function : Sets all the sequences to uppercase
1492 Returns :
1493 Argument :
1495 =cut
1497 sub uppercase {
1498 my $self = shift;
1499 my $seq;
1500 my $temp;
1502 foreach $seq ( $self->each_seq() ) {
1503 $temp = $seq->seq();
1504 $temp =~ tr/[a-z]/[A-Z]/;
1506 $seq->seq($temp);
1508 return 1;
1511 =head2 cigar_line
1513 Title : cigar_line()
1514 Usage : %cigars = $align->cigar_line()
1515 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1516 Report) line for each sequence in the alignment. Examples are
1517 "1,60" or "5,10:12,58", where the numbers refer to conserved
1518 positions within the alignment. The keys of the hash are the
1519 NSEs (name/start/end) assigned to each sequence.
1520 Args : threshold (optional, defaults to 100)
1521 Returns : Hash of strings (cigar lines)
1523 =cut
1525 sub cigar_line {
1526 my $self = shift;
1527 my $thr=shift||100;
1528 my %cigars;
1530 my @consensus = split "",($self->consensus_string($thr));
1531 my $len = $self->length;
1532 my $gapchar = $self->gap_char;
1534 # create a precursor, something like (1,4,5,6,7,33,45),
1535 # where each number corresponds to a conserved position
1536 foreach my $seq ( $self->each_seq ) {
1537 my @seq = split "", uc ($seq->seq);
1538 my $pos = 1;
1539 for (my $x = 0 ; $x < $len ; $x++ ) {
1540 if ($seq[$x] eq $consensus[$x]) {
1541 push @{$cigars{$seq->get_nse}},$pos;
1542 $pos++;
1543 } elsif ($seq[$x] ne $gapchar) {
1544 $pos++;
1548 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1549 for my $name (keys %cigars) {
1550 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1551 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1552 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1553 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1554 ${$cigars{$name}}[$#{$cigars{$name}}] );
1555 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1556 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1557 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1558 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1562 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1563 for my $name (keys %cigars) {
1564 my @remove;
1565 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1566 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1567 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1568 unshift @remove,$x;
1571 for my $pos (@remove) {
1572 splice @{$cigars{$name}}, $pos, 1;
1575 # join and punctuate
1576 for my $name (keys %cigars) {
1577 my ($start,$end,$str) = "";
1578 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1579 $str .= ($start . "," . $end . ":");
1581 $str =~ s/:$//;
1582 $cigars{$name} = $str;
1584 %cigars;
1588 =head2 match_line
1590 Title : match_line()
1591 Usage : $line = $align->match_line()
1592 Function : Generates a match line - much like consensus string
1593 except that a line indicating the '*' for a match.
1594 Args : (optional) Match line characters ('*' by default)
1595 (optional) Strong match char (':' by default)
1596 (optional) Weak match char ('.' by default)
1597 Returns : String
1599 =cut
1601 sub match_line {
1602 my ($self,$matchlinechar, $strong, $weak) = @_;
1603 my %matchchars = ('match' => $matchlinechar || '*',
1604 'weak' => $weak || '.',
1605 'strong' => $strong || ':',
1606 'mismatch' => ' ',
1609 my @seqchars;
1610 my $alphabet;
1611 foreach my $seq ( $self->each_seq ) {
1612 push @seqchars, [ split(//, uc ($seq->seq)) ];
1613 $alphabet = $seq->alphabet unless defined $alphabet;
1615 my $refseq = shift @seqchars;
1616 # let's just march down the columns
1617 my $matchline;
1618 POS:
1619 foreach my $pos ( 0..$self->length ) {
1620 my $refchar = $refseq->[$pos];
1621 my $char = $matchchars{'mismatch'};
1622 unless( defined $refchar ) {
1623 last if $pos == $self->length; # short circuit on last residue
1624 # this in place to handle jason's soon-to-be-committed
1625 # intron mapping code
1626 goto bottom;
1628 my %col = ($refchar => 1);
1629 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1630 foreach my $seq ( @seqchars ) {
1631 next if $pos >= scalar @$seq;
1632 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1633 $seq->[$pos] eq ' ' );
1634 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1636 my @colresidues = sort keys %col;
1638 # if all the values are the same
1639 if( $dash ) { $char = $matchchars{'mismatch'} }
1640 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1641 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1642 # matches for protein seqs
1643 TYPE:
1644 foreach my $type ( qw(strong weak) ) {
1645 # iterate through categories
1646 my %groups;
1647 # iterate through each of the aa in the col
1648 # look to see which groups it is in
1649 foreach my $c ( @colresidues ) {
1650 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1651 push @{$groups{$f}},$c;
1654 GRP:
1655 foreach my $cols ( values %groups ) {
1656 @$cols = sort @$cols;
1657 # now we are just testing to see if two arrays
1658 # are identical w/o changing either one
1659 # have to be same len
1660 next if( scalar @$cols != scalar @colresidues );
1661 # walk down the length and check each slot
1662 for($_=0;$_ < (scalar @$cols);$_++ ) {
1663 next GRP if( $cols->[$_] ne $colresidues[$_] );
1665 $char = $matchchars{$type};
1666 last TYPE;
1670 bottom:
1671 $matchline .= $char;
1673 return $matchline;
1677 =head2 gap_line
1679 Title : gap_line()
1680 Usage : $line = $align->gap_line()
1681 Function : Generates a gap line - much like consensus string
1682 except that a line where '-' represents gap
1683 Args : (optional) gap line characters ('-' by default)
1684 Returns : string
1686 =cut
1688 sub gap_line {
1689 my ($self,$gapchar) = @_;
1690 $gapchar = $gapchar || $self->gap_char;
1691 my %gap_hsh; # column gaps vector
1692 foreach my $seq ( $self->each_seq ) {
1693 my $i = 0;
1694 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1695 map {[$i++, $_]} split(//, uc ($seq->seq));
1697 my $gap_line;
1698 foreach my $pos ( 0..$self->length-1 ) {
1699 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1701 return $gap_line;
1704 =head2 all_gap_line
1706 Title : all_gap_line()
1707 Usage : $line = $align->all_gap_line()
1708 Function : Generates a gap line - much like consensus string
1709 except that a line where '-' represents all-gap column
1710 Args : (optional) gap line characters ('-' by default)
1711 Returns : string
1713 =cut
1715 sub all_gap_line {
1716 my ($self,$gapchar) = @_;
1717 $gapchar = $gapchar || $self->gap_char;
1718 my %gap_hsh; # column gaps counter hash
1719 my @seqs = $self->each_seq;
1720 foreach my $seq ( @seqs ) {
1721 my $i = 0;
1722 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1723 map {[$i++, $_]} split(//, uc ($seq->seq));
1725 my $gap_line;
1726 foreach my $pos ( 0..$self->length-1 ) {
1727 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1728 # gaps column
1729 $gap_line .= $gapchar;
1730 } else {
1731 $gap_line .= '.';
1734 return $gap_line;
1737 =head2 gap_col_matrix
1739 Title : gap_col_matrix()
1740 Usage : my $cols = $align->gap_col_matrix()
1741 Function : Generates an array of hashes where
1742 each entry in the array is a hash reference
1743 with keys of all the sequence names and
1744 and value of 1 or 0 if the sequence has a gap at that column
1745 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1747 =cut
1749 sub gap_col_matrix {
1750 my ($self,$gapchar) = @_;
1751 $gapchar = $gapchar || $self->gap_char;
1752 my %gap_hsh; # column gaps vector
1753 my @cols;
1754 foreach my $seq ( $self->each_seq ) {
1755 my $i = 0;
1756 my $str = $seq->seq;
1757 my $len = $seq->length;
1758 my $ch;
1759 my $id = $seq->display_id;
1760 while( $i < $len ) {
1761 $ch = substr($str, $i, 1);
1762 $cols[$i++]->{$id} = ($ch eq $gapchar);
1765 return \@cols;
1768 =head2 match
1770 Title : match()
1771 Usage : $ali->match()
1772 Function : Goes through all columns and changes residues that are
1773 identical to residue in first sequence to match '.'
1774 character. Sets match_char.
1776 USE WITH CARE: Most MSA formats do not support match
1777 characters in sequences, so this is mostly for output
1778 only. NEXUS format (Bio::AlignIO::nexus) can handle
1780 Returns : 1
1781 Argument : a match character, optional, defaults to '.'
1783 =cut
1785 sub match {
1786 my ($self, $match) = @_;
1788 $match ||= '.';
1789 my ($matching_char) = $match;
1790 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1791 $self->map_chars($matching_char, '-');
1793 my @seqs = $self->each_seq();
1794 return 1 unless scalar @seqs > 1;
1796 my $refseq = shift @seqs ;
1797 my @refseq = split //, $refseq->seq;
1798 my $gapchar = $self->gap_char;
1800 foreach my $seq ( @seqs ) {
1801 my @varseq = split //, $seq->seq();
1802 for ( my $i=0; $i < scalar @varseq; $i++) {
1803 $varseq[$i] = $match if defined $refseq[$i] &&
1804 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1805 $refseq[$i] =~ /$gapchar/ )
1806 && $refseq[$i] eq $varseq[$i];
1808 $seq->seq(join '', @varseq);
1810 $self->match_char($match);
1811 return 1;
1815 =head2 unmatch
1817 Title : unmatch()
1818 Usage : $ali->unmatch()
1819 Function : Undoes the effect of method match. Unsets match_char.
1820 Returns : 1
1821 Argument : a match character, optional, defaults to '.'
1823 See L<match> and L<match_char>
1825 =cut
1827 sub unmatch {
1828 my ($self, $match) = @_;
1830 $match ||= '.';
1832 my @seqs = $self->each_seq();
1833 return 1 unless scalar @seqs > 1;
1835 my $refseq = shift @seqs ;
1836 my @refseq = split //, $refseq->seq;
1837 my $gapchar = $self->gap_char;
1838 foreach my $seq ( @seqs ) {
1839 my @varseq = split //, $seq->seq();
1840 for ( my $i=0; $i < scalar @varseq; $i++) {
1841 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1842 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1843 $refseq[$i] =~ /$gapchar/ ) &&
1844 $varseq[$i] eq $match;
1846 $seq->seq(join '', @varseq);
1848 $self->match_char('');
1849 return 1;
1852 =head1 MSA attributes
1854 Methods for setting and reading the MSA attributes.
1856 Note that the methods defining character semantics depend on the user
1857 to set them sensibly. They are needed only by certain input/output
1858 methods. Unset them by setting to an empty string ('').
1860 =head2 id
1862 Title : id
1863 Usage : $myalign->id("Ig")
1864 Function : Gets/sets the id field of the alignment
1865 Returns : An id string
1866 Argument : An id string (optional)
1868 =cut
1870 sub id {
1871 my ($self, $name) = @_;
1873 if (defined( $name )) {
1874 $self->{'_id'} = $name;
1877 return $self->{'_id'};
1880 =head2 accession
1882 Title : accession
1883 Usage : $myalign->accession("PF00244")
1884 Function : Gets/sets the accession field of the alignment
1885 Returns : An acc string
1886 Argument : An acc string (optional)
1888 =cut
1890 sub accession {
1891 my ($self, $acc) = @_;
1893 if (defined( $acc )) {
1894 $self->{'_accession'} = $acc;
1897 return $self->{'_accession'};
1900 =head2 description
1902 Title : description
1903 Usage : $myalign->description("14-3-3 proteins")
1904 Function : Gets/sets the description field of the alignment
1905 Returns : An description string
1906 Argument : An description string (optional)
1908 =cut
1910 sub description {
1911 my ($self, $name) = @_;
1913 if (defined( $name )) {
1914 $self->{'_description'} = $name;
1917 return $self->{'_description'};
1920 =head2 missing_char
1922 Title : missing_char
1923 Usage : $myalign->missing_char("?")
1924 Function : Gets/sets the missing_char attribute of the alignment
1925 It is generally recommended to set it to 'n' or 'N'
1926 for nucleotides and to 'X' for protein.
1927 Returns : An missing_char string,
1928 Argument : An missing_char string (optional)
1930 =cut
1932 sub missing_char {
1933 my ($self, $char) = @_;
1935 if (defined $char ) {
1936 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1937 $self->{'_missing_char'} = $char;
1940 return $self->{'_missing_char'};
1943 =head2 match_char
1945 Title : match_char
1946 Usage : $myalign->match_char('.')
1947 Function : Gets/sets the match_char attribute of the alignment
1948 Returns : An match_char string,
1949 Argument : An match_char string (optional)
1951 =cut
1953 sub match_char {
1954 my ($self, $char) = @_;
1956 if (defined $char ) {
1957 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1958 $self->{'_match_char'} = $char;
1961 return $self->{'_match_char'};
1964 =head2 gap_char
1966 Title : gap_char
1967 Usage : $myalign->gap_char('-')
1968 Function : Gets/sets the gap_char attribute of the alignment
1969 Returns : An gap_char string, defaults to '-'
1970 Argument : An gap_char string (optional)
1972 =cut
1974 sub gap_char {
1975 my ($self, $char) = @_;
1977 if (defined $char || ! defined $self->{'_gap_char'} ) {
1978 $char= '-' unless defined $char;
1979 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1980 $self->{'_gap_char'} = $char;
1982 return $self->{'_gap_char'};
1985 =head2 symbol_chars
1987 Title : symbol_chars
1988 Usage : my @symbolchars = $aln->symbol_chars;
1989 Function: Returns all the seen symbols (other than gaps)
1990 Returns : array of characters that are the seen symbols
1991 Args : boolean to include the gap/missing/match characters
1993 =cut
1995 sub symbol_chars{
1996 my ($self,$includeextra) = @_;
1998 unless ($self->{'_symbols'}) {
1999 foreach my $seq ($self->each_seq) {
2000 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
2003 my %copy = %{$self->{'_symbols'}};
2004 if( ! $includeextra ) {
2005 foreach my $char ( $self->gap_char, $self->match_char,
2006 $self->missing_char) {
2007 delete $copy{$char} if( defined $char );
2010 return keys %copy;
2013 =head1 Alignment descriptors
2015 These read only methods describe the MSA in various ways.
2018 =head2 score
2020 Title : score
2021 Usage : $str = $ali->score()
2022 Function : get/set a score of the alignment
2023 Returns : a score for the alignment
2024 Argument : an optional score to set
2026 =cut
2028 sub score {
2029 my $self = shift;
2030 $self->{score} = shift if @_;
2031 return $self->{score};
2034 =head2 consensus_string
2036 Title : consensus_string
2037 Usage : $str = $ali->consensus_string($threshold_percent)
2038 Function : Makes a strict consensus
2039 Returns : Consensus string
2040 Argument : Optional threshold ranging from 0 to 100.
2041 The consensus residue has to appear at least threshold %
2042 of the sequences at a given location, otherwise a '?'
2043 character will be placed at that location.
2044 (Default value = 0%)
2046 =cut
2048 sub consensus_string {
2049 my $self = shift;
2050 my $threshold = shift;
2052 my $out = "";
2053 my $len = $self->length - 1;
2055 foreach ( 0 .. $len ) {
2056 $out .= $self->_consensus_aa($_,$threshold);
2058 return $out;
2061 =head2 consensus_conservation
2063 Title : consensus_conservation
2064 Usage : @conservation = $ali->consensus_conservation();
2065 Function : Conservation (as a percent) of each position of alignment
2066 Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2067 Argument :
2069 =cut
2071 sub consensus_conservation {
2072 my $self = shift;
2073 my @cons;
2074 my $num_sequences = $self->num_sequences;
2075 foreach my $point (0..$self->length-1) {
2076 my %hash = $self->_consensus_counts($point);
2077 # max frequency of a non-gap letter
2078 my $max = (sort {$b<=>$a} values %hash )[0];
2079 push @cons, 100 * $max / $num_sequences;
2081 return @cons;
2084 sub _consensus_aa {
2085 my $self = shift;
2086 my $point = shift;
2087 my $threshold_percent = shift || -1 ;
2088 my ($seq,%hash,$count,$letter,$key);
2089 my $gapchar = $self->gap_char;
2090 %hash = $self->_consensus_counts($point);
2091 my $number_of_sequences = $self->num_sequences();
2092 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2093 $count = -1;
2094 $letter = '?';
2096 foreach $key ( sort keys %hash ) {
2097 # print "Now at $key $hash{$key}\n";
2098 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2099 $letter = $key;
2100 $count = $hash{$key};
2103 return $letter;
2106 # Frequency of each letter in one column
2107 sub _consensus_counts {
2108 my $self = shift;
2109 my $point = shift;
2110 my %hash;
2111 my $gapchar = $self->gap_char;
2112 foreach my $seq ( $self->each_seq() ) {
2113 my $letter = substr($seq->seq,$point,1);
2114 $self->throw("--$point-----------") if $letter eq '';
2115 ($letter eq $gapchar || $letter =~ /\./) && next;
2116 $hash{$letter}++;
2118 return %hash;
2122 =head2 consensus_iupac
2124 Title : consensus_iupac
2125 Usage : $str = $ali->consensus_iupac()
2126 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2127 and RNA. The output is in upper case except when gaps in
2128 a column force output to be in lower case.
2130 Note that if your alignment sequences contain a lot of
2131 IUPAC ambiquity codes you often have to manually set
2132 alphabet. Bio::PrimarySeq::_guess_type thinks they
2133 indicate a protein sequence.
2134 Returns : consensus string
2135 Argument : none
2136 Throws : on protein sequences
2138 =cut
2140 sub consensus_iupac {
2141 my $self = shift;
2142 my $out = "";
2143 my $len = $self->length-1;
2145 # only DNA and RNA sequences are valid
2146 foreach my $seq ( $self->each_seq() ) {
2147 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2148 if $seq->alphabet eq 'protein';
2150 # loop over the alignment columns
2151 foreach my $count ( 0 .. $len ) {
2152 $out .= $self->_consensus_iupac($count);
2154 return $out;
2157 sub _consensus_iupac {
2158 my ($self, $column) = @_;
2159 my ($string, $char, $rna);
2161 #determine all residues in a column
2162 foreach my $seq ( $self->each_seq() ) {
2163 $string .= substr($seq->seq, $column, 1);
2165 $string = uc $string;
2167 # quick exit if there's an N in the string
2168 if ($string =~ /N/) {
2169 $string =~ /\W/ ? return 'n' : return 'N';
2171 # ... or if there are only gap characters
2172 return '-' if $string =~ /^\W+$/;
2174 # treat RNA as DNA in regexps
2175 if ($string =~ /U/) {
2176 $string =~ s/U/T/;
2177 $rna = 1;
2180 # the following s///'s only need to be done to the _first_ ambiguity code
2181 # as we only need to see the _range_ of characters in $string
2183 if ($string =~ /[VDHB]/) {
2184 $string =~ s/V/AGC/;
2185 $string =~ s/D/AGT/;
2186 $string =~ s/H/ACT/;
2187 $string =~ s/B/CTG/;
2190 if ($string =~ /[SKYRWM]/) {
2191 $string =~ s/S/GC/;
2192 $string =~ s/K/GT/;
2193 $string =~ s/Y/CT/;
2194 $string =~ s/R/AG/;
2195 $string =~ s/W/AT/;
2196 $string =~ s/M/AC/;
2199 # and now the guts of the thing
2201 if ($string =~ /A/) {
2202 $char = 'A'; # A A
2203 if ($string =~ /G/) {
2204 $char = 'R'; # A and G (purines) R
2205 if ($string =~ /C/) {
2206 $char = 'V'; # A and G and C V
2207 if ($string =~ /T/) {
2208 $char = 'N'; # A and G and C and T N
2210 } elsif ($string =~ /T/) {
2211 $char = 'D'; # A and G and T D
2213 } elsif ($string =~ /C/) {
2214 $char = 'M'; # A and C M
2215 if ($string =~ /T/) {
2216 $char = 'H'; # A and C and T H
2218 } elsif ($string =~ /T/) {
2219 $char = 'W'; # A and T W
2221 } elsif ($string =~ /C/) {
2222 $char = 'C'; # C C
2223 if ($string =~ /T/) {
2224 $char = 'Y'; # C and T (pyrimidines) Y
2225 if ($string =~ /G/) {
2226 $char = 'B'; # C and T and G B
2228 } elsif ($string =~ /G/) {
2229 $char = 'S'; # C and G S
2231 } elsif ($string =~ /G/) {
2232 $char = 'G'; # G G
2233 if ($string =~ /C/) {
2234 $char = 'S'; # G and C S
2235 } elsif ($string =~ /T/) {
2236 $char = 'K'; # G and T K
2238 } elsif ($string =~ /T/) {
2239 $char = 'T'; # T T
2242 $char = 'U' if $rna and $char eq 'T';
2243 $char = lc $char if $string =~ /\W/;
2245 return $char;
2249 =head2 consensus_meta
2251 Title : consensus_meta
2252 Usage : $seqmeta = $ali->consensus_meta()
2253 Function : Returns a Bio::Seq::Meta object containing the consensus
2254 strings derived from meta data analysis.
2255 Returns : Bio::Seq::Meta
2256 Argument : Bio::Seq::Meta
2257 Throws : non-MetaI object
2259 =cut
2261 sub consensus_meta {
2262 my ($self, $meta) = @_;
2263 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2264 $self->throw('Not a Bio::Seq::MetaI object');
2266 return $self->{'_aln_meta'} = $meta if $meta;
2267 return $self->{'_aln_meta'}
2270 =head2 is_flush
2272 Title : is_flush
2273 Usage : if ( $ali->is_flush() )
2274 Function : Tells you whether the alignment
2275 : is flush, i.e. all of the same length
2276 Returns : 1 or 0
2277 Argument :
2279 =cut
2281 sub is_flush {
2282 my ($self,$report) = @_;
2283 my $seq;
2284 my $length = (-1);
2285 my $temp;
2287 foreach $seq ( $self->each_seq() ) {
2288 if( $length == (-1) ) {
2289 $length = CORE::length($seq->seq());
2290 next;
2293 $temp = CORE::length($seq->seq());
2294 if( $temp != $length ) {
2295 $self->warn("expecting $length not $temp from ".
2296 $seq->display_id) if( $report );
2297 $self->debug("expecting $length not $temp from ".
2298 $seq->display_id);
2299 $self->debug($seq->seq(). "\n");
2300 return 0;
2304 return 1;
2308 =head2 length
2310 Title : length()
2311 Usage : $len = $ali->length()
2312 Function : Returns the maximum length of the alignment.
2313 To be sure the alignment is a block, use is_flush
2314 Returns : Integer
2315 Argument :
2317 =cut
2319 sub length_aln {
2320 my $self = shift;
2321 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2322 $self->length(@_);
2325 sub length {
2326 my $self = shift;
2327 my $seq;
2328 my $length = -1;
2329 my $temp;
2331 foreach $seq ( $self->each_seq() ) {
2332 $temp = $seq->length();
2333 if( $temp > $length ) {
2334 $length = $temp;
2338 return $length;
2342 =head2 maxdisplayname_length
2344 Title : maxdisplayname_length
2345 Usage : $ali->maxdisplayname_length()
2346 Function : Gets the maximum length of the displayname in the
2347 alignment. Used in writing out various MSA formats.
2348 Returns : integer
2349 Argument :
2351 =cut
2353 sub maxname_length {
2354 my $self = shift;
2355 $self->deprecated("maxname_length - deprecated method.".
2356 " Use maxdisplayname_length() instead.");
2357 $self->maxdisplayname_length();
2360 sub maxnse_length {
2361 my $self = shift;
2362 $self->deprecated("maxnse_length - deprecated method.".
2363 " Use maxnse_length() instead.");
2364 $self->maxdisplayname_length();
2367 sub maxdisplayname_length {
2368 my $self = shift;
2369 my $maxname = (-1);
2370 my ($seq,$len);
2372 foreach $seq ( $self->each_seq() ) {
2373 $len = CORE::length $self->displayname($seq->get_nse());
2375 if( $len > $maxname ) {
2376 $maxname = $len;
2380 return $maxname;
2383 =head2 max_metaname_length
2385 Title : max_metaname_length
2386 Usage : $ali->max_metaname_length()
2387 Function : Gets the maximum length of the meta name tags in the
2388 alignment for the sequences and for the alignment.
2389 Used in writing out various MSA formats.
2390 Returns : integer
2391 Argument : None
2393 =cut
2395 sub max_metaname_length {
2396 my $self = shift;
2397 my $maxname = (-1);
2398 my ($seq,$len);
2400 # check seq meta first
2401 for $seq ( $self->each_seq() ) {
2402 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2403 for my $mtag ($seq->meta_names) {
2404 $len = CORE::length $mtag;
2405 if( $len > $maxname ) {
2406 $maxname = $len;
2411 # alignment meta
2412 for my $meta ($self->consensus_meta) {
2413 next unless $meta;
2414 for my $name ($meta->meta_names) {
2415 $len = CORE::length $name;
2416 if( $len > $maxname ) {
2417 $maxname = $len;
2422 return $maxname;
2425 =head2 num_residues
2427 Title : num_residues
2428 Usage : $no = $ali->num_residues
2429 Function : number of residues in total in the alignment
2430 Returns : integer
2431 Argument :
2432 Note : replaces no_residues()
2434 =cut
2436 sub num_residues {
2437 my $self = shift;
2438 my $count = 0;
2440 foreach my $seq ($self->each_seq) {
2441 my $str = $seq->seq();
2443 $count += ($str =~ s/[A-Za-z]//g);
2446 return $count;
2449 =head2 num_sequences
2451 Title : num_sequences
2452 Usage : $depth = $ali->num_sequences
2453 Function : number of sequence in the sequence alignment
2454 Returns : integer
2455 Argument : none
2456 Note : replaces no_sequences()
2458 =cut
2460 sub num_sequences {
2461 my $self = shift;
2462 return scalar($self->each_seq);
2466 =head2 average_percentage_identity
2468 Title : average_percentage_identity
2469 Usage : $id = $align->average_percentage_identity
2470 Function: The function uses a fast method to calculate the average
2471 percentage identity of the alignment
2472 Returns : The average percentage identity of the alignment
2473 Args : None
2474 Notes : This method implemented by Kevin Howe calculates a figure that is
2475 designed to be similar to the average pairwise identity of the
2476 alignment (identical in the absence of gaps), without having to
2477 explicitly calculate pairwise identities proposed by Richard Durbin.
2478 Validated by Ewan Birney ad Alex Bateman.
2480 =cut
2482 sub average_percentage_identity{
2483 my ($self,@args) = @_;
2485 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2486 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2488 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2490 if (! $self->is_flush()) {
2491 $self->throw("All sequences in the alignment must be the same length");
2494 @seqs = $self->each_seq();
2495 $len = $self->length();
2497 # load the each hash with correct keys for existence checks
2499 for( my $index=0; $index < $len; $index++) {
2500 foreach my $letter (@alphabet) {
2501 $countHashes[$index]->{$letter} = 0;
2504 foreach my $seq (@seqs) {
2505 my @seqChars = split //, $seq->seq();
2506 for( my $column=0; $column < @seqChars; $column++ ) {
2507 my $char = uc($seqChars[$column]);
2508 if (exists $countHashes[$column]->{$char}) {
2509 $countHashes[$column]->{$char}++;
2514 $total = 0;
2515 $divisor = 0;
2516 for(my $column =0; $column < $len; $column++) {
2517 my %hash = %{$countHashes[$column]};
2518 $subdivisor = 0;
2519 foreach my $res (keys %hash) {
2520 $total += $hash{$res}*($hash{$res} - 1);
2521 $subdivisor += $hash{$res};
2523 $divisor += $subdivisor * ($subdivisor - 1);
2525 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2528 =head2 percentage_identity
2530 Title : percentage_identity
2531 Usage : $id = $align->percentage_identity
2532 Function: The function calculates the average percentage identity
2533 (aliased to average_percentage_identity)
2534 Returns : The average percentage identity
2535 Args : None
2537 =cut
2539 sub percentage_identity {
2540 my $self = shift;
2541 return $self->average_percentage_identity();
2544 =head2 overall_percentage_identity
2546 Title : overall_percentage_identity
2547 Usage : $id = $align->overall_percentage_identity
2548 $id = $align->overall_percentage_identity('short')
2549 Function: The function calculates the percentage identity of
2550 the conserved columns
2551 Returns : The percentage identity of the conserved columns
2552 Args : length value to use, optional defaults to alignment length
2553 possible values: 'align', 'short', 'long'
2555 The argument values 'short' and 'long' refer to shortest and longest
2556 sequence in the alignment. Method modification code by Hongyu Zhang.
2558 =cut
2560 sub overall_percentage_identity{
2561 my ($self, $length_measure) = @_;
2563 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2565 my %enum = map {$_ => undef} qw (align short long);
2567 $self->throw("Unknown argument [$length_measure]")
2568 if $length_measure and not exists $enum{$length_measure};
2569 $length_measure ||= 'align';
2571 if (! $self->is_flush()) {
2572 $self->throw("All sequences in the alignment must be the same length");
2575 # Count the residues seen at each position
2576 my $len;
2577 my $total = 0; # number of positions with identical residues
2578 my @countHashes;
2579 my @seqs = $self->each_seq;
2580 my $nof_seqs = scalar @seqs;
2581 my $aln_len = $self->length();
2582 for my $seq (@seqs) {
2583 my $seqstr = $seq->seq;
2585 # Count residues for given sequence
2586 for my $column (0 .. $aln_len-1) {
2587 my $char = uc( substr($seqstr, $column, 1) );
2588 if ( exists $alphabet{$char} ) {
2590 # This is a valid char
2591 if ( defined $countHashes[$column]->{$char} ) {
2592 $countHashes[$column]->{$char}++;
2593 } else {
2594 $countHashes[$column]->{$char} = 1;
2597 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2598 # All sequences have this same residue
2599 $total++;
2605 # Sequence length
2606 if ($length_measure eq 'short' || $length_measure eq 'long') {
2607 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2608 if ($length_measure eq 'short') {
2609 if ( (not defined $len) || ($seq_len < $len) ) {
2610 $len = $seq_len;
2612 } elsif ($length_measure eq 'long') {
2613 if ( (not defined $len) || ($seq_len > $len) ) {
2614 $len = $seq_len;
2621 if ($length_measure eq 'align') {
2622 $len = $aln_len;
2625 return ($total / $len ) * 100.0;
2630 =head1 Alignment positions
2632 Methods to map a sequence position into an alignment column and back.
2633 column_from_residue_number() does the former. The latter is really a
2634 property of the sequence object and can done using
2635 L<Bio::LocatableSeq::location_from_column>:
2637 # select somehow a sequence from the alignment, e.g.
2638 my $seq = $aln->get_seq_by_pos(1);
2639 #$loc is undef or Bio::LocationI object
2640 my $loc = $seq->location_from_column(5);
2642 =head2 column_from_residue_number
2644 Title : column_from_residue_number
2645 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2646 Function: This function gives the position in the alignment
2647 (i.e. column number) of the given residue number in the
2648 sequence with the given name. For example, for the
2649 alignment
2651 Seq1/91-97 AC..DEF.GH.
2652 Seq2/24-30 ACGG.RTY...
2653 Seq3/43-51 AC.DDEF.GHI
2655 column_from_residue_number( "Seq1", 94 ) returns 6.
2656 column_from_residue_number( "Seq2", 25 ) returns 2.
2657 column_from_residue_number( "Seq3", 50 ) returns 10.
2659 An exception is thrown if the residue number would lie
2660 outside the length of the aligment
2661 (e.g. column_from_residue_number( "Seq2", 22 )
2663 Note: If the the parent sequence is represented by more than
2664 one alignment sequence and the residue number is present in
2665 them, this method finds only the first one.
2667 Returns : A column number for the position in the alignment of the
2668 given residue in the given sequence (1 = first column)
2669 Args : A sequence id/name (not a name/start-end)
2670 A residue number in the whole sequence (not just that
2671 segment of it in the alignment)
2673 =cut
2675 sub column_from_residue_number {
2676 my ($self, $name, $resnumber) = @_;
2678 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2679 $self->throw("Second argument residue number missing") unless $resnumber;
2681 foreach my $seq ($self->each_seq_with_id($name)) {
2682 my $col;
2683 eval {
2684 $col = $seq->column_from_residue_number($resnumber);
2686 next if $@;
2687 return $col;
2690 $self->throw("Could not find a sequence segment in $name ".
2691 "containing residue number $resnumber");
2695 =head1 Sequence names
2697 Methods to manipulate the display name. The default name based on the
2698 sequence id and subsequence positions can be overridden in various
2699 ways.
2701 =head2 displayname
2703 Title : displayname
2704 Usage : $myalign->displayname("Ig", "IgA")
2705 Function : Gets/sets the display name of a sequence in the alignment
2706 Returns : A display name string
2707 Argument : name of the sequence
2708 displayname of the sequence (optional)
2710 =cut
2712 sub get_displayname {
2713 my $self = shift;
2714 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2715 $self->displayname(@_);
2718 sub set_displayname {
2719 my $self = shift;
2720 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2721 $self->displayname(@_);
2724 sub displayname {
2725 my ($self, $name, $disname) = @_;
2727 $self->throw("No sequence with name [$name]")
2728 unless defined $self->{'_seq'}->{$name};
2730 if( $disname and $name) {
2731 $self->{'_dis_name'}->{$name} = $disname;
2732 return $disname;
2734 elsif( defined $self->{'_dis_name'}->{$name} ) {
2735 return $self->{'_dis_name'}->{$name};
2736 } else {
2737 return $name;
2741 =head2 set_displayname_count
2743 Title : set_displayname_count
2744 Usage : $ali->set_displayname_count
2745 Function : Sets the names to be name_# where # is the number of
2746 times this name has been used.
2747 Returns : 1, on success
2748 Argument :
2750 =cut
2752 sub set_displayname_count {
2753 my $self= shift;
2754 my (@arr,$name,$seq,$count,$temp,$nse);
2756 foreach $seq ( $self->each_alphabetically() ) {
2757 $nse = $seq->get_nse();
2759 #name will be set when this is the second
2760 #time (or greater) is has been seen
2762 if( defined $name and $name eq ($seq->id()) ) {
2763 $temp = sprintf("%s_%s",$name,$count);
2764 $self->displayname($nse,$temp);
2765 $count++;
2766 } else {
2767 $count = 1;
2768 $name = $seq->id();
2769 $temp = sprintf("%s_%s",$name,$count);
2770 $self->displayname($nse,$temp);
2771 $count++;
2774 return 1;
2777 =head2 set_displayname_flat
2779 Title : set_displayname_flat
2780 Usage : $ali->set_displayname_flat()
2781 Function : Makes all the sequences be displayed as just their name,
2782 not name/start-end
2783 Returns : 1
2784 Argument :
2786 =cut
2788 sub set_displayname_flat {
2789 my $self = shift;
2790 my ($nse,$seq);
2792 foreach $seq ( $self->each_seq() ) {
2793 $nse = $seq->get_nse();
2794 $self->displayname($nse,$seq->id());
2796 return 1;
2799 =head2 set_displayname_normal
2801 Title : set_displayname_normal
2802 Usage : $ali->set_displayname_normal()
2803 Function : Makes all the sequences be displayed as name/start-end
2804 Returns : 1, on success
2805 Argument :
2807 =cut
2809 sub set_displayname_normal {
2810 my $self = shift;
2811 my ($nse,$seq);
2813 foreach $seq ( $self->each_seq() ) {
2814 $nse = $seq->get_nse();
2815 $self->displayname($nse,$nse);
2817 return 1;
2820 =head2 source
2822 Title : source
2823 Usage : $obj->source($newval)
2824 Function: sets the Alignment source program
2825 Example :
2826 Returns : value of source
2827 Args : newvalue (optional)
2830 =cut
2832 sub source{
2833 my ($self,$value) = @_;
2834 if( defined $value) {
2835 $self->{'_source'} = $value;
2837 return $self->{'_source'};
2840 =head2 set_displayname_safe
2842 Title : set_displayname_safe
2843 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2844 Function : Assign machine-generated serial names to sequences in input order.
2845 Designed to protect names during PHYLIP runs. Assign 10-char string
2846 in the form of "S000000001" to "S999999999". Restore the original
2847 names using "restore_displayname".
2848 Returns : 1. a new $aln with system names;
2849 2. a hash ref for restoring names
2850 Argument : Number for id length (default 10)
2852 =cut
2854 sub set_displayname_safe {
2855 my $self = shift;
2856 my $idlength = shift || 10;
2857 my ($seq, %phylip_name);
2858 my $ct=0;
2859 my $new=Bio::SimpleAlign->new();
2860 foreach $seq ( $self->each_seq() ) {
2861 $ct++;
2862 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2863 $phylip_name{$pname}=$seq->id();
2864 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2865 -seq => $seq->seq(),
2866 -alphabet => $seq->alphabet,
2867 -start => $seq->{_start},
2868 -end => $seq->{_end}
2870 $new->add_seq($new_seq);
2873 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2874 return ($new, \%phylip_name);
2877 =head2 restore_displayname
2879 Title : restore_displayname
2880 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2881 Function : Restore original sequence names (after running
2882 $ali->set_displayname_safe)
2883 Returns : a new $aln with names restored.
2884 Argument : a hash reference of names from "set_displayname_safe".
2886 =cut
2888 sub restore_displayname {
2889 my $self = shift;
2890 my $ref=shift;
2891 my %name=%$ref;
2892 my $new=Bio::SimpleAlign->new();
2893 foreach my $seq ( $self->each_seq() ) {
2894 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2895 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2896 -seq => $seq->seq(),
2897 -alphabet => $seq->alphabet,
2898 -start => $seq->{_start},
2899 -end => $seq->{_end}
2901 $new->add_seq($new_seq);
2903 return $new;
2906 =head2 sort_by_start
2908 Title : sort_by_start
2909 Usage : $ali->sort_by_start
2910 Function : Changes the order of the alignment to the start position of each
2911 subalignment
2912 Returns :
2913 Argument :
2915 =cut
2917 sub sort_by_start {
2918 my $self = shift;
2919 my ($seq,$nse,@arr,%hash,$count);
2920 foreach $seq ( $self->each_seq() ) {
2921 $nse = $seq->get_nse;
2922 $hash{$nse} = $seq;
2924 $count = 0;
2925 %{$self->{'_order'}} = (); # reset the hash;
2926 foreach $nse ( sort _startend keys %hash) {
2927 $self->{'_order'}->{$count} = $nse;
2928 $count++;
2933 sub _startend
2935 my ($aname,$arange) = split (/[\/]/,$a);
2936 my ($bname,$brange) = split (/[\/]/,$b);
2937 my ($astart,$aend) = split(/\-/,$arange);
2938 my ($bstart,$bend) = split(/\-/,$brange);
2939 return $astart <=> $bstart;
2942 =head2 bracket_string
2944 Title : bracket_string
2945 Usage : my @params = (-refseq => 'testseq',
2946 -allele1 => 'allele1',
2947 -allele2 => 'allele2',
2948 -delimiters => '{}',
2949 -separator => '/');
2950 $str = $aln->bracket_string(@params)
2952 Function : When supplied with a list of parameters (see below), returns a
2953 string in BIC format. This is used for allelic comparisons.
2954 Briefly, if either allele contains a base change when compared to
2955 the refseq, the base or gap for each allele is represented in
2956 brackets in the order present in the 'alleles' parameter.
2958 For the following data:
2960 >testseq
2961 GGATCCATTGCTACT
2962 >allele1
2963 GGATCCATTCCTACT
2964 >allele2
2965 GGAT--ATTCCTCCT
2967 the returned string with parameters 'refseq => testseq' and
2968 'alleles => [qw(allele1 allele2)]' would be:
2970 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2971 Returns : BIC-formatted string
2972 Argument : Required args
2973 refseq : string (ID) of the reference sequence used
2974 as basis for comparison
2975 allele1 : string (ID) of the first allele
2976 allele2 : string (ID) of the second allele
2977 Optional args
2978 delimiters: two symbol string of left and right delimiters.
2979 Only the first two symbols are used
2980 default = '[]'
2981 separator : string used as a separator. Only the first
2982 symbol is used
2983 default = '/'
2984 Throws : On no refseq/alleles, or invalid refseq/alleles.
2986 =cut
2988 sub bracket_string {
2989 my ($self, @args) = @_;
2990 my ($ref, $a1, $a2, $delim, $sep) =
2991 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2992 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2993 my ($ld, $rd);
2994 ($ld, $rd) = split('', $delim, 2) if $delim;
2995 $ld ||= '[';
2996 $rd ||= ']';
2997 $sep ||= '/';
2998 my ($refseq, $allele1, $allele2) =
2999 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
3000 if (!$refseq || !$allele1 || !$allele2) {
3001 $self->throw("One of your refseq/allele IDs is invalid!");
3003 my $len = $self->length-1;
3004 my $bic = '';
3005 # loop over the alignment columns
3006 for my $column ( 0 .. $len ) {
3007 my $string;
3008 my ($compres, $res1, $res2) =
3009 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3010 # are any of the allele symbols different from the refseq?
3011 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3012 $ld.$res1.$sep.$res2.$rd;
3013 $bic .= $string;
3015 return $bic;
3019 =head2 methods implementing Bio::FeatureHolderI
3021 FeatureHolderI implementation to support labeled character sets like one
3022 would get from NEXUS represented data.
3024 =head2 get_SeqFeatures
3026 Usage : @features = $aln->get_SeqFeatures
3027 Function: Get the feature objects held by this feature holder.
3028 Example :
3029 Returns : an array of Bio::SeqFeatureI implementing objects
3030 Args : optional filter coderef, taking a Bio::SeqFeatureI
3031 : as argument, returning TRUE if wanted, FALSE if
3032 : unwanted
3034 =cut
3036 sub get_SeqFeatures {
3037 my $self = shift;
3038 my $filter_cb = shift;
3039 $self->throw("Arg (filter callback) must be a coderef") unless
3040 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
3041 if( !defined $self->{'_as_feat'} ) {
3042 $self->{'_as_feat'} = [];
3044 if ($filter_cb) {
3045 return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
3047 return @{$self->{'_as_feat'}};
3050 =head2 add_SeqFeature
3052 Usage : $aln->add_SeqFeature($subfeat);
3053 Function: adds a SeqFeature into the SeqFeature array.
3054 Example :
3055 Returns : true on success
3056 Args : a Bio::SeqFeatureI object
3057 Note : This implementation is not compliant
3058 with Bio::FeatureHolderI
3060 =cut
3062 sub add_SeqFeature {
3063 my ($self,@feat) = @_;
3065 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3067 foreach my $feat ( @feat ) {
3068 if( !$feat->isa("Bio::SeqFeatureI") ) {
3069 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3072 push(@{$self->{'_as_feat'}},$feat);
3074 return 1;
3078 =head2 remove_SeqFeatures
3080 Usage : $obj->remove_SeqFeatures
3081 Function: Removes all SeqFeatures. If you want to remove only a subset,
3082 remove that subset from the returned array, and add back the rest.
3083 Returns : The array of Bio::SeqFeatureI features that was
3084 deleted from this alignment.
3085 Args : none
3087 =cut
3089 sub remove_SeqFeatures {
3090 my $self = shift;
3092 return () unless $self->{'_as_feat'};
3093 my @feats = @{$self->{'_as_feat'}};
3094 $self->{'_as_feat'} = [];
3095 return @feats;
3098 =head2 feature_count
3100 Title : feature_count
3101 Usage : $obj->feature_count()
3102 Function: Return the number of SeqFeatures attached to the alignment
3103 Returns : integer representing the number of SeqFeatures
3104 Args : None
3106 =cut
3108 sub feature_count {
3109 my ($self) = @_;
3111 if (defined($self->{'_as_feat'})) {
3112 return ($#{$self->{'_as_feat'}} + 1);
3113 } else {
3114 return 0;
3118 =head2 get_all_SeqFeatures
3120 Title : get_all_SeqFeatures
3121 Usage :
3122 Function: Get all SeqFeatures.
3123 Example :
3124 Returns : an array of Bio::SeqFeatureI implementing objects
3125 Args : none
3126 Note : Falls through to Bio::FeatureHolderI implementation.
3128 =cut
3130 =head2 methods for Bio::AnnotatableI
3132 AnnotatableI implementation to support sequence alignments which
3133 contain annotation (NEXUS, Stockholm).
3135 =head2 annotation
3137 Title : annotation
3138 Usage : $ann = $aln->annotation or
3139 $aln->annotation($ann)
3140 Function: Gets or sets the annotation
3141 Returns : Bio::AnnotationCollectionI object
3142 Args : None or Bio::AnnotationCollectionI object
3144 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3145 for more information
3147 =cut
3149 sub annotation {
3150 my ($obj,$value) = @_;
3151 if( defined $value ) {
3152 $obj->throw("object of class ".ref($value)." does not implement ".
3153 "Bio::AnnotationCollectionI. Too bad.")
3154 unless $value->isa("Bio::AnnotationCollectionI");
3155 $obj->{'_annotation'} = $value;
3156 } elsif( ! defined $obj->{'_annotation'}) {
3157 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3159 return $obj->{'_annotation'};
3162 =head1 Deprecated methods
3164 =cut
3166 =head2 no_residues
3168 Title : no_residues
3169 Usage : $no = $ali->no_residues
3170 Function : number of residues in total in the alignment
3171 Returns : integer
3172 Argument :
3173 Note : deprecated in favor of num_residues()
3175 =cut
3177 sub no_residues {
3178 my $self = shift;
3179 $self->deprecated(-warn_version => 1.0069,
3180 -throw_version => 1.0075,
3181 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3182 $self->num_residues(@_);
3185 =head2 no_sequences
3187 Title : no_sequences
3188 Usage : $depth = $ali->no_sequences
3189 Function : number of sequence in the sequence alignment
3190 Returns : integer
3191 Argument :
3192 Note : deprecated in favor of num_sequences()
3194 =cut
3196 sub no_sequences {
3197 my $self = shift;
3198 $self->deprecated(-warn_version => 1.0069,
3199 -throw_version => 1.0075,
3200 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3201 $self->num_sequences(@_);
3204 =head2 mask_columns
3206 Title : mask_columns
3207 Usage : $aln2 = $aln->mask_columns(20,30)
3208 Function : Masks a slice of the alignment inclusive of start and
3209 end columns, and the first column in the alignment is denoted 1.
3210 Mask beyond the length of the sequence does not do padding.
3211 Returns : A Bio::SimpleAlign object
3212 Args : Positive integer for start column, positive integer for end column,
3213 optional string value use for the mask. Example:
3215 $aln2 = $aln->mask_columns(20,30,'?')
3216 Note : Masking must use a character that is not used for gaps or
3217 frameshifts. These can be adjusted using the relevant global
3218 variables, but be aware these may be (uncontrollably) modified
3219 elsewhere within BioPerl (see bug 2715)
3221 =cut
3223 sub mask_columns {
3224 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3225 my $self = shift;
3227 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3228 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3230 # coordinates are alignment-based, not sequence-based
3231 my ($start, $end, $mask_char) = @_;
3232 unless (defined $mask_char) { $mask_char = 'N' }
3234 $self->throw("Mask start has to be a positive integer and less than ".
3235 "alignment length, not [$start]")
3236 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3237 $self->throw("Mask end has to be a positive integer and less than ".
3238 "alignment length, not [$end]")
3239 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3240 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3241 "end [$end]") unless $start <= $end;
3242 $self->throw("Mask character $mask_char has to be a single character ".
3243 "and not a gap or frameshift symbol")
3244 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3246 my $aln = $self->new;
3247 $aln->id($self->id);
3248 foreach my $seq ( $self->each_seq() ) {
3249 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3250 -alphabet => $seq->alphabet,
3251 -strand => $seq->strand,
3252 -verbose => $self->verbose);
3254 # convert from 1-based alignment coords!
3255 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3256 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3257 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3258 $new_seq->seq($new_dna_string);
3259 $aln->add_seq($new_seq);
3261 return $aln;