Bio/SimpleAlign.pm

   1 # $Id$
   2 # BioPerl module for SimpleAlign
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13 #
  14 #  History:
  15 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  16 #       May 2001 major rewrite - Heikki Lehvaslaiho
  17
  18 =head1 NAME
  19
  20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  21
  22 =head1 SYNOPSIS
  23
  24   # Use Bio::AlignIO to read in the alignment
  25   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  26   $aln = $str->next_aln();
  27
  28   # Describe
  29   print $aln->length;
  30   print $aln->num_residues;
  31   print $aln->is_flush;
  32   print $aln->num_sequences;
  33   print $aln->score;
  34   print $aln->percentage_identity;
  35   print $aln->consensus_string(50);
  36
  37   # Find the position in the alignment for a sequence location
  38   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  39
  40   # Extract sequences and check values for the alignment column $pos
  41   foreach $seq ($aln->each_seq) {
  42       $res = $seq->subseq($pos, $pos);
  43       $count{$res}++;
  44   }
  45   foreach $res (keys %count) {
  46       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  47   }
  48
  49   # Manipulate
  50   $aln->remove_seq($seq);
  51   $mini_aln = $aln->slice(20,30);  # get a block of columns
  52   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  53   $new_aln = $aln->remove_columns([20,30]); # remove by position
  54   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  55
  56   # Analyze
  57   $str = $aln->consensus_string($threshold_percent);
  58   $str = $aln->match_line();
  59   $str = $aln->cigar_line();
  60   $id = $aln->percentage_identity;
  61
  62   # See the module documentation for details and more methods.
  63
  64 =head1 DESCRIPTION
  65
  66 SimpleAlign is an object that handles a multiple sequence alignment
  67 (MSA). It is very permissive of types (it does not insist on sequences
  68 being all same length, for example). Think of it as a set of sequences
  69 with a whole series of built-in manipulations and methods for reading and
  70 writing alignments.
  71
  72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  73 to store its sequences. These are subsequences with a start and end
  74 positions in the parent reference sequence. Each sequence in the
  75 SimpleAlign object is a Bio::LocatableSeq.
  76
  77 SimpleAlign expects the combination of name, start, and end for a
  78 given sequence to be unique in the alignment, and this is the key for the
  79 internal hashes (name, start, end are abbreviated C<nse> in the code).
  80 However, in some cases people do not want the name/start-end to be displayed:
  81 either multiple names in an alignment or names specific to the alignment
  82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  83 C<displayname>, and generally is what is used to print out the
  84 alignment. They default to name/start-end.
  85
  86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  87
  88 =head1 FEEDBACK
  89
  90 =head2 Mailing Lists
  91
  92 User feedback is an integral part of the evolution of this and other
  93 Bioperl modules.  Send your comments and suggestions preferably to one
  94 of the Bioperl mailing lists.  Your participation is much appreciated.
  95
  96   bioperl-l@bioperl.org                  - General discussion
  97   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  98
  99 =head2 Support
 100
 101 Please direct usage questions or support issues to the mailing list:
 102
 103 I<bioperl-l@bioperl.org>
 104
 105 rather than to the module maintainer directly. Many experienced and
 106 reponsive experts will be able look at the problem and quickly
 107 address it. Please include a thorough description of the problem
 108 with code and data examples if at all possible.
 109
 110 =head2 Reporting Bugs
 111
 112 Report bugs to the Bioperl bug tracking system to help us keep track
 113 the bugs and their resolution. Bug reports can be submitted via the
 114 web:
 115
 116   http://bugzilla.open-bio.org/
 117
 118 =head1 AUTHOR
 119
 120 Ewan Birney, birney@ebi.ac.uk
 121
 122 =head1 CONTRIBUTORS
 123
 124 Allen Day, allenday-at-ucla.edu,
 125 Richard Adams, Richard.Adams-at-ed.ac.uk,
 126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 128 Allen Smith, allens-at-cpan.org,
 129 Jason Stajich, jason-at-bioperl.org,
 130 Anthony Underwood, aunderwood-at-phls.org.uk,
 131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 132 Brian Osborne, bosborne at alum.mit.edu
 133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 134 Hongyu Zhang, forward at hongyu.org
 135 Jay Hannah, jay at jays.net
 136 Alexandr Bezginov, albezg at gmail.com
 137
 138 =head1 SEE ALSO
 139
 140 L<Bio::LocatableSeq>
 141
 142 =head1 APPENDIX
 143
 144 The rest of the documentation details each of the object
 145 methods. Internal methods are usually preceded with a _
 146
 147 =cut
 148
 149 # 'Let the code begin...
 150
 151 package Bio::SimpleAlign;
 152 use vars qw(%CONSERVATION_GROUPS);
 153 use strict;
 154
 155 use Bio::LocatableSeq;  # uses Seq's as list
 156
 157 use Bio::Seq;
 158 use Bio::SeqFeature::Generic;
 159
 160 BEGIN {
 161     # This data should probably be in a more centralized module...
 162     # it is taken from Clustalw documentation.
 163     # These are all the positively scoring groups that occur in the
 164     # Gonnet Pam250 matrix. The strong and weak groups are
 165     # defined as strong score >0.5 and weak score =<0.5 respectively.
 166
 167     %CONSERVATION_GROUPS = (
 168             'strong' => [ qw(
 169                                                  STA
 170                                                  NEQK
 171                                                  NHQK
 172                                                  NDEQ
 173                                                  QHRK
 174                                                  MILV
 175                                                  MILF
 176                                                  HY
 177                                                  FYW )],
 178                                 'weak' => [ qw(
 179                       CSA
 180                                                ATV
 181                                                SAG
 182                                                STNK
 183                                                STPA
 184                                                SGND
 185                                                SNDEQK
 186                                                NDEQHK
 187                                                NEQHRK
 188                                                FVLIM
 189                                                HFY )],);
 190 }
 191
 192 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 193             Bio::FeatureHolderI);
 194
 195 =head2 new
 196
 197  Title     : new
 198  Usage     : my $aln = Bio::SimpleAlign->new();
 199  Function  : Creates a new simple align object
 200  Returns   : Bio::SimpleAlign
 201  Args      : -source     => string representing the source program
 202                             where this alignment came from
 203              -annotation => Bio::AnnotationCollectionI
 204              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 205              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 206              -consensus  => consensus string
 207              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 208
 209 =cut
 210
 211
 212 sub new {
 213   my($class,@args) = @_;
 214
 215   my $self = $class->SUPER::new(@args);
 216
 217   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 218                                             SOURCE
 219                                             SCORE
 220                                             ID
 221                                             ACCESSION
 222                                             DESCRIPTION
 223                                             SEQS
 224                                             FEATURES
 225                                             ANNOTATION
 226                                             SEQ_ANNOTATION
 227                                             CONSENSUS
 228                                             CONSENSUS_META
 229                                             )], @args);
 230   $src && $self->source($src);
 231   defined $score && $self->score($score);
 232   # we need to set up internal hashs first!
 233
 234   $self->{'_seq'} = {};
 235   $self->{'_order'} = {};
 236   $self->{'_start_end_lists'} = {};
 237   $self->{'_dis_name'} = {};
 238   $self->{'_id'} = 'NoName';
 239   # maybe we should automatically read in from args. Hmmm...
 240   $id  && $self->id($id);
 241   $acc && $self->accession($acc);
 242   $desc && $self->description($desc);
 243   $coll && $self->annotation($coll);
 244   # sequence annotation is layered into a provided annotation collection (or dies)
 245   if ($sa) {
 246     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 247                  "with a sequence annotation collection")
 248         if !$coll;
 249     $coll->add_Annotation('seq_annotation', $sa);
 250   }
 251   if ($feats && ref $feats eq 'ARRAY') {
 252     for my $feat (@$feats) {
 253         $self->add_SeqFeature($feat);
 254     }
 255   }
 256   $con && $self->consensus($con);
 257   $cmeta && $self->consensus_meta($cmeta);
 258   # assumes these are in correct alignment order
 259   if ($seqs && ref($seqs) eq 'ARRAY') {
 260     for my $seq (@$seqs) {
 261         $self->add_seq($seq);
 262     }
 263   }
 264
 265   return $self; # success - we hope!
 266 }
 267
 268 =head1 Modifier methods
 269
 270 These methods modify the MSA by adding, removing or shuffling complete
 271 sequences.
 272
 273 =head2 add_seq
 274
 275  Title     : add_seq
 276  Usage     : $myalign->add_seq($newseq);
 277              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 278  Function  : Adds another sequence to the alignment. *Does not* align
 279              it - just adds it to the hashes.
 280              If -ORDER is specified, the sequence is inserted at the
 281              the position spec'd by -ORDER, and existing sequences
 282              are pushed down the storage array.
 283  Returns   : nothing
 284  Args      : A Bio::LocatableSeq object
 285              Positive integer for the sequence position (optional)
 286
 287 See L<Bio::LocatableSeq> for more information
 288
 289 =cut
 290
 291 sub addSeq {
 292     my $self = shift;
 293     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 294     $self->add_seq(@_);
 295 }
 296
 297 sub add_seq {
 298     my $self = shift;
 299     my @args = @_;
 300     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 301     my ($name,$id,$start,$end);
 302
 303     unless ($seq) {
 304         $self->throw("LocatableSeq argument required");
 305     }
 306     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 307         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 308     }
 309     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 310     $order--; # jay's patch (user-specified order is 1-origin)
 311
 312     if ($order < 0) {
 313         $self->throw("User-specified value for ORDER must be >= 1");
 314     }
 315
 316     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 317
 318     # build the symbol list for this sequence,
 319     # will prune out the gap and missing/match chars
 320     # when actually asked for the symbol list in the
 321     # symbol_chars
 322     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 323
 324     $name = $seq->get_nse;
 325
 326     if( $self->{'_seq'}->{$name} ) {
 327         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 328     }
 329     else {
 330         $self->debug( "Assigning $name to $order\n");
 331
 332     my $ordh = $self->{'_order'};
 333     if ($ordh->{$order}) {
 334         # make space to insert
 335         # $c->() returns (in reverse order) the first subsequence
 336         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 337         # (3,2,1), and $c->(2,4,5) returns (2).
 338         my $c;
 339         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 340         map {
 341      $ordh->{$_+1} = $ordh->{$_}
 342         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 343
 344     }
 345     $ordh->{$order} = $name;
 346
 347         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 348             $self->{'_start_end_lists'}->{$id} = [];
 349         }
 350         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 351     }
 352
 353     $self->{'_seq'}->{$name} = $seq;
 354
 355 }
 356
 357
 358 =head2 remove_seq
 359
 360  Title     : remove_seq
 361  Usage     : $aln->remove_seq($seq);
 362  Function  : Removes a single sequence from an alignment
 363  Returns   :
 364  Argument  : a Bio::LocatableSeq object
 365
 366 =cut
 367
 368 sub removeSeq {
 369     my $self = shift;
 370     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 371     $self->remove_seq(@_);
 372 }
 373
 374 sub remove_seq {
 375     my $self = shift;
 376     my $seq = shift;
 377     my ($name,$id,$start,$end);
 378
 379     $self->throw("Need Bio::Locatable seq argument ")
 380         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 381
 382     $id = $seq->id();
 383     $start = $seq->start();
 384     $end  = $seq->end();
 385     $name = sprintf("%s/%d-%d",$id,$start,$end);
 386
 387     if( !exists $self->{'_seq'}->{$name} ) {
 388         $self->throw("Sequence $name does not exist in the alignment to remove!");
 389     }
 390
 391     delete $self->{'_seq'}->{$name};
 392
 393     # we need to remove this seq from the start_end_lists hash
 394
 395     if (exists $self->{'_start_end_lists'}->{$id}) {
 396         # we need to find the sequence in the array.
 397
 398         my ($i, $found);;
 399         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 400             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 401                 $found = 1;
 402                 last;
 403             }
 404         }
 405         if ($found) {
 406             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 407         }
 408         else {
 409             $self->throw("Could not find the sequence to remoce from the start-end list");
 410         }
 411     }
 412     else {
 413         $self->throw("There is no seq list for the name $id");
 414     }
 415     # we need to shift order hash
 416     my %rev_order = reverse %{$self->{'_order'}};
 417     my $no = $rev_order{$name};
 418     my $num_sequences = $self->num_sequences;
 419     for (; $no < $num_sequences; $no++) {
 420        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 421     }
 422     delete $self->{'_order'}->{$no};
 423     return 1;
 424 }
 425
 426
 427 =head2 purge
 428
 429  Title   : purge
 430  Usage   : $aln->purge(0.7);
 431  Function: Removes sequences above given sequence similarity
 432            This function will grind on large alignments. Beware!
 433  Example :
 434  Returns : An array of the removed sequences
 435  Args    : float, threshold for similarity
 436
 437 =cut
 438
 439 sub purge {
 440         my ($self,$perc) = @_;
 441         my (%duplicate, @dups);
 442
 443         my @seqs = $self->each_seq();
 444
 445         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 446                 my $seq = $seqs[$i];
 447
 448                 #skip if already in duplicate hash
 449                 next if exists $duplicate{$seq->display_id} ;
 450                 my $one = $seq->seq();
 451
 452                 my @one = split '', $one;       #split to get 1aa per array element
 453
 454                 for (my $j=$i+1;$j < @seqs;$j++) {
 455                         my $seq2 = $seqs[$j];
 456
 457                         #skip if already in duplicate hash
 458                         next if exists $duplicate{$seq2->display_id} ;
 459
 460                         my $two = $seq2->seq();
 461                         my @two = split '', $two;
 462
 463                         my $count = 0;
 464                         my $res = 0;
 465                         for (my $k=0;$k<@one;$k++) {
 466                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 467                                           $one[$k] eq $two[$k]) {
 468                                         $count++;
 469                                 }
 470                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 471                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 472                                         $res++;
 473                                 }
 474                         }
 475
 476                         my $ratio = 0;
 477                         $ratio = $count/$res unless $res == 0;
 478
 479                         # if above threshold put in duplicate hash and push onto
 480                         # duplicate array for returning to get_unique
 481                         if ( $ratio > $perc ) {
 482                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 483                                 $duplicate{$seq2->display_id} = 1;
 484                                 push @dups, $seq2;
 485                         }
 486                 }
 487         }
 488         foreach my $seq (@dups) {
 489                 $self->remove_seq($seq);
 490         }
 491         return @dups;
 492 }
 493
 494 =head2 sort_alphabetically
 495
 496  Title     : sort_alphabetically
 497  Usage     : $ali->sort_alphabetically
 498  Function  : Changes the order of the alignment to alphabetical on name
 499              followed by numerical by number.
 500  Returns   :
 501  Argument  :
 502
 503 =cut
 504
 505 sub sort_alphabetically {
 506     my $self = shift;
 507     my ($seq,$nse,@arr,%hash,$count);
 508
 509     foreach $seq ( $self->each_seq() ) {
 510         $nse = $seq->get_nse;
 511         $hash{$nse} = $seq;
 512     }
 513
 514     $count = 0;
 515
 516     %{$self->{'_order'}} = (); # reset the hash;
 517
 518     foreach $nse ( sort _alpha_startend keys %hash) {
 519         $self->{'_order'}->{$count} = $nse;
 520
 521         $count++;
 522     }
 523     1;
 524 }
 525
 526 =head2 sort_by_list
 527
 528  Title     : sort_by_list
 529  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 530  Function  : Arbitrarily order sequences in an alignment
 531  Returns   : A new Bio::SimpleAlign object
 532  Argument  : a file listing sequence names in intended order (one name per line)
 533
 534 =cut
 535
 536 sub sort_by_list {
 537     my ($self, $list) = @_;
 538     my (@seq, @ids, %order);
 539
 540     foreach my $seq ( $self->each_seq() ) {
 541         push @seq, $seq;
 542         push @ids, $seq->display_id;
 543     }
 544
 545     my $ct=1;
 546     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 547     while (<$listfh>) {
 548       chomp;
 549       my $name=$_;
 550       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 551       $order{$name}=$ct++;
 552     }
 553     close($listfh);
 554
 555     # use the map-sort-map idiom:
 556     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 557     my $aln = $self->new;
 558     foreach (@sorted) { $aln->add_seq($_) }
 559     return $aln;
 560 }
 561
 562 =head2 set_new_reference
 563
 564  Title     : set_new_reference
 565  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 566              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 567              as the reference (1st) sequence
 568  Function  : Change/Set a new reference (i.e., the first) sequence
 569  Returns   : a new Bio::SimpleAlign object.
 570              Throws an exception if designated sequence not found
 571  Argument  : a positive integer of sequence order, or a sequence name
 572              in the original alignment
 573
 574 =cut
 575
 576 sub set_new_reference {
 577     my ($self, $seqid) = @_;
 578     my $aln = $self->new;
 579     my (@seq, @ids, @new_seq);
 580     my $is_num=0;
 581     foreach my $seq ( $self->each_seq() ) {
 582         push @seq, $seq;
 583         push @ids, $seq->display_id;
 584     }
 585
 586     if ($seqid =~ /^\d+$/) { # argument is seq position
 587         $is_num=1;
 588         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 589     } else { # argument is a seq name
 590         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 591     }
 592
 593     for (my $i=0; $i<=$#seq; $i++) {
 594         my $pos=$i+1;
 595         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 596             unshift @new_seq, $seq[$i];
 597         } else {
 598             push @new_seq, $seq[$i];
 599         }
 600     }
 601     foreach (@new_seq) { $aln->add_seq($_);  }
 602     return $aln;
 603 }
 604
 605 sub _in_aln {  # check if input name exists in the alignment
 606     my ($str, $ref) = @_;
 607     foreach (@$ref) {
 608         return 1 if $str eq $_;
 609     }
 610     return 0;
 611 }
 612
 613
 614 =head2 uniq_seq
 615
 616  Title     : uniq_seq
 617  Usage     : $aln->uniq_seq():  Remove identical sequences in
 618              in the alignment.  Ambiguous base ("N", "n") and
 619              leading and ending gaps ("-") are NOT counted as
 620              differences.
 621  Function  : Make a new alignment of unique sequence types (STs)
 622  Returns   : 1a. if called in a scalar context,
 623                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 624              1b. if called in an array context,
 625                 a new Bio::SimpleAlign object, and a hashref whose keys
 626                 are sequence types, and whose values are arrayrefs to
 627                 lists of sequence ids within the corresponding sequence type
 628              2. if $aln->verbose > 0, ST of each sequence is sent to
 629                 STDERR (in a tabular format)
 630  Argument  : None
 631
 632 =cut
 633
 634 sub uniq_seq {
 635     my ($self, $seqid) = @_;
 636     my $aln = $self->new;
 637     my (%member, %order, @seq, @uniq_str, $st);
 638     my $order=0;
 639     my $len = $self->length();
 640     $st = {};
 641     foreach my $seq ( $self->each_seq() ) {
 642         my $str = $seq->seq();
 643
 644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 645 # comparing two sequence strings
 646
 647     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 648         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 649     # 2nd, convert leading and ending gaps to "?":
 650         $str = &_convert_leading_ending_gaps($str, '-', '?');
 651     # Note that '?' also can mean unknown residue.
 652     # I don't like making global class member changes like this, too
 653     # prone to errors... -- cjfields 08-11-18
 654     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 655         my $new = Bio::LocatableSeq->new(
 656                      -id      => $seq->id(),
 657                                          -alphabet=> $seq->alphabet,
 658                                          -seq     => $str,
 659                                          -start   => $seq->start,
 660                                          -end     => $seq->end
 661                                          );
 662         push @seq, $new;
 663     }
 664
 665     foreach my $seq (@seq) {
 666         my $str = $seq->seq();
 667         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 668         if ($seen) { # seen before
 669             my @memb = @{$member{$key}};
 670             push @memb, $seq;
 671             $member{$key} = \@memb;
 672         } else {  # not seen
 673             push @uniq_str, $key;
 674             $order++;
 675             $member{$key} = [ ($seq) ];
 676             $order{$key} = $order;
 677         }
 678     }
 679
 680     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 682         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 684         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 685         my $gap='-';
 686         my $end= CORE::length($str2);
 687         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 688         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 689                                          -seq  =>$str2,
 690                                          -start=>1,
 691                                          -end  =>$end
 692                                          );
 693         $aln->add_seq($new);
 694         foreach (@{$member{$str}}) {
 695             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 696         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 697         }
 698     }
 699     return wantarray ? ($aln, $st) : $aln;
 700 }
 701
 702 sub _check_uniq {  # check if same seq exists in the alignment
 703     my ($str1, $ref, $length) = @_;
 704     my @char1=split //, $str1;
 705     my @array=@$ref;
 706
 707     return (0, $str1) if @array==0; # not seen (1st sequence)
 708
 709     foreach my $str2 (@array) {
 710         my $diff=0;
 711         my @char2=split //, $str2;
 712         for (my $i=0; $i<=$length-1; $i++) {
 713             next if $char1[$i] eq '?';
 714             next if $char2[$i] eq '?';
 715             $diff++ if $char1[$i] ne $char2[$i];
 716         }
 717         return (1, $str2) if $diff == 0;  # seen before
 718     }
 719
 720     return (0, $str1); # not seen
 721 }
 722
 723 sub _convert_leading_ending_gaps {
 724     my $s=shift;
 725     my $sym1=shift;
 726     my $sym2=shift;
 727     my @array=split //, $s;
 728 # convert leading char:
 729     for (my $i=0; $i<=$#array; $i++) {
 730         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 731     }
 732 # convert ending char:
 733     for (my $i = $#array; $i>= 0; $i--) {
 734         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 735     }
 736     my $s_new=join '', @array;
 737     return $s_new;
 738 }
 739
 740 =head1 Sequence selection methods
 741
 742 Methods returning one or more sequences objects.
 743
 744 =head2 each_seq
 745
 746  Title     : each_seq
 747  Usage     : foreach $seq ( $align->each_seq() )
 748  Function  : Gets a Seq object from the alignment
 749  Returns   : Seq object
 750  Argument  :
 751
 752 =cut
 753
 754 sub eachSeq {
 755     my $self = shift;
 756     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 757     $self->each_seq();
 758 }
 759
 760 sub each_seq {
 761         my $self = shift;
 762         my (@arr,$order);
 763
 764         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 765                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 766                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 767                 }
 768         }
 769         return @arr;
 770 }
 771
 772
 773 =head2 each_alphabetically
 774
 775  Title     : each_alphabetically
 776  Usage     : foreach $seq ( $ali->each_alphabetically() )
 777  Function  : Returns a sequence object, but the objects are returned
 778              in alphabetically sorted order.
 779              Does not change the order of the alignment.
 780  Returns   : Seq object
 781  Argument  :
 782
 783 =cut
 784
 785 sub each_alphabetically {
 786         my $self = shift;
 787         my ($seq,$nse,@arr,%hash,$count);
 788
 789         foreach $seq ( $self->each_seq() ) {
 790                 $nse = $seq->get_nse;
 791                 $hash{$nse} = $seq;
 792         }
 793
 794         foreach $nse ( sort _alpha_startend keys %hash) {
 795                 push(@arr,$hash{$nse});
 796         }
 797         return @arr;
 798 }
 799
 800 sub _alpha_startend {
 801     my ($aname,$astart,$bname,$bstart);
 802     ($aname,$astart) = split (/-/,$a);
 803     ($bname,$bstart) = split (/-/,$b);
 804
 805     if( $aname eq $bname ) {
 806         return $astart <=> $bstart;
 807     }
 808     else {
 809         return $aname cmp $bname;
 810     }
 811 }
 812
 813 =head2 each_seq_with_id
 814
 815  Title     : each_seq_with_id
 816  Usage     : foreach $seq ( $align->each_seq_with_id() )
 817  Function  : Gets a Seq objects from the alignment, the contents
 818              being those sequences with the given name (there may be
 819              more than one)
 820  Returns   : Seq object
 821  Argument  : a seq name
 822
 823 =cut
 824
 825 sub eachSeqWithId {
 826     my $self = shift;
 827     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 828     $self->each_seq_with_id(@_);
 829 }
 830
 831 sub each_seq_with_id {
 832     my $self = shift;
 833     my $id = shift;
 834
 835     $self->throw("Method each_seq_with_id needs a sequence name argument")
 836         unless defined $id;
 837
 838     my (@arr, $seq);
 839
 840     if (exists($self->{'_start_end_lists'}->{$id})) {
 841         @arr = @{$self->{'_start_end_lists'}->{$id}};
 842     }
 843     return @arr;
 844 }
 845
 846 =head2 get_seq_by_pos
 847
 848  Title     : get_seq_by_pos
 849  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 850  Function  : Gets a sequence based on its position in the alignment.
 851              Numbering starts from 1.  Sequence positions larger than
 852              num_sequences() will thow an error.
 853  Returns   : a Bio::LocatableSeq object
 854  Args      : positive integer for the sequence position
 855
 856 =cut
 857
 858 sub get_seq_by_pos {
 859
 860     my $self = shift;
 861     my ($pos) = @_;
 862
 863     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 864         unless $pos =~ /^\d+$/ and $pos > 0;
 865     $self->throw("No sequence at position [$pos]")
 866         unless $pos <= $self->num_sequences ;
 867
 868     my $nse = $self->{'_order'}->{--$pos};
 869     return $self->{'_seq'}->{$nse};
 870 }
 871
 872 =head2 get_seq_by_id
 873
 874  Title     : get_seq_by_id
 875  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 876  Function  : Gets a sequence based on its name.
 877              Sequences that do not exist will warn and return undef
 878  Returns   : a Bio::LocatableSeq object
 879  Args      : string for sequence name
 880
 881 =cut
 882
 883 sub get_seq_by_id {
 884     my ($self,$name) = @_;
 885     unless( defined $name ) {
 886       $self->warn("Must provide a sequence name");
 887       return;
 888     }
 889     for my $seq ( values %{$self->{'_seq'}} ) {
 890       if ( $seq->id eq $name) {
 891         return $seq;
 892       }
 893     }
 894     return;
 895 }
 896
 897 =head2 seq_with_features
 898
 899  Title   : seq_with_features
 900  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 901                                           -consensus => 60
 902                                           -mask =>
 903            sub { my $consensus = shift;
 904
 905                  for my $i (1..5){
 906                     my $n = 'N' x $i;
 907                     my $q = '\?' x $i;
 908                     while($consensus =~ /[^?]$q[^?]/){
 909                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 910                     }
 911                   }
 912                  return $consensus;
 913                }
 914                                          );
 915  Function: produces a Bio::Seq object by first splicing gaps from -pos
 916            (by means of a splice_by_seq_pos() call), then creating
 917            features using non-? chars (by means of a consensus_string()
 918            call with stringency -consensus).
 919  Returns : a Bio::Seq object
 920  Args    : -pos : required. sequence from which to build the Bio::Seq
 921              object
 922            -consensus : optional, defaults to consensus_string()'s
 923              default cutoff value
 924            -mask : optional, a coderef to apply to consensus_string()'s
 925              output before building features.  this may be useful for
 926              closing gaps of 1 bp by masking over them with N, for
 927              instance
 928
 929 =cut
 930
 931 sub seq_with_features{
 932    my ($self,%arg) = @_;
 933
 934    #first do the preparatory splice
 935    $self->throw("must provide a -pos argument") unless $arg{-pos};
 936    $self->splice_by_seq_pos($arg{-pos});
 937
 938    my $consensus_string = $self->consensus_string($arg{-consensus});
 939    $consensus_string = $arg{-mask}->($consensus_string)
 940          if defined($arg{-mask});
 941
 942    my(@bs,@es);
 943
 944    push @bs, 1 if $consensus_string =~ /^[^?]/;
 945
 946    while($consensus_string =~ /\?[^?]/g){
 947          push @bs, pos($consensus_string);
 948    }
 949    while($consensus_string =~ /[^?]\?/g){
 950          push @es, pos($consensus_string);
 951    }
 952
 953    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 954
 955    my $seq = Bio::Seq->new();
 956
 957 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 958 #                -source_tag => 'location',
 959 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 960 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 961 #                                                  );
 962 #   $seq->add_SeqFeature($rootfeature);
 963
 964    while(my $b = shift @bs){
 965          my $e = shift @es;
 966          $seq->add_SeqFeature(
 967        Bio::SeqFeature::Generic->new(
 968          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 969          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 970          -source_tag => $self->source || 'MSA',
 971        )
 972      );
 973    }
 974
 975    return $seq;
 976 }
 977
 978
 979 =head1 Create new alignments
 980
 981 The result of these methods are horizontal or vertical subsets of the
 982 current MSA.
 983
 984 =head2 select
 985
 986  Title     : select
 987  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 988  Function  : Creates a new alignment from a continuous subset of
 989              sequences.  Numbering starts from 1.  Sequence positions
 990              larger than num_sequences() will thow an error.
 991  Returns   : a Bio::SimpleAlign object
 992  Args      : positive integer for the first sequence
 993              positive integer for the last sequence to include (optional)
 994
 995 =cut
 996
 997 sub select {
 998     my $self = shift;
 999     my ($start, $end) = @_;
1000
1001     $self->throw("Select start has to be a positive integer, not [$start]")
1002         unless $start =~ /^\d+$/ and $start > 0;
1003     $self->throw("Select end has to be a positive integer, not [$end]")
1004         unless $end  =~ /^\d+$/ and $end > 0;
1005     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006         unless $start <= $end;
1007
1008     my $aln = $self->new;
1009     foreach my $pos ($start .. $end) {
1010         $aln->add_seq($self->get_seq_by_pos($pos));
1011     }
1012     $aln->id($self->id);
1013     # fix for meta, sf, ann
1014     return $aln;
1015 }
1016
1017 =head2 select_noncont
1018
1019  Title     : select_noncont
1020  Usage     : # 1st and 3rd sequences, sorted
1021              $aln2 = $aln->select_noncont(1, 3)
1022
1023              # 1st and 3rd sequences, sorted (same as first)
1024              $aln2 = $aln->select_noncont(3, 1)
1025
1026              # 1st and 3rd sequences, unsorted
1027              $aln2 = $aln->select_noncont('nosort',3, 1)
1028
1029  Function  : Creates a new alignment from a subset of sequences.  Numbering
1030              starts from 1.  Sequence positions larger than num_sequences() will
1031              throw an error.  Sorts the order added to new alignment by default,
1032              to prevent sorting pass 'nosort' as the first argument in the list.
1033  Returns   : a Bio::SimpleAlign object
1034  Args      : array of integers for the sequences.  If the string 'nosort' is
1035              passed as the first argument, the sequences will not be sorted
1036              in the new alignment but will appear in the order listed.
1037
1038 =cut
1039
1040 sub select_noncont {
1041         my $self = shift;
1042     my $nosort = 0;
1043         my (@pos) = @_;
1044     if ($pos[0] !~ m{^\d+$}) {
1045         my $sortcmd = shift @pos;
1046         if ($sortcmd eq 'nosort') {
1047             $nosort = 1;
1048         } else {
1049             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1050         }
1051     }
1052
1053     my $end = $self->num_sequences;
1054     foreach ( @pos ) {
1055                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1057         }
1058
1059         @pos = sort {$a <=> $b} @pos unless $nosort;
1060
1061         my $aln = $self->new;
1062         foreach my $p (@pos) {
1063                 $aln->add_seq($self->get_seq_by_pos($p));
1064         }
1065         $aln->id($self->id);
1066     # fix for meta, sf, ann
1067         return $aln;
1068 }
1069
1070 =head2 slice
1071
1072  Title     : slice
1073  Usage     : $aln2 = $aln->slice(20,30)
1074  Function  : Creates a slice from the alignment inclusive of start and
1075              end columns, and the first column in the alignment is denoted 1.
1076              Sequences with no residues in the slice are excluded from the
1077              new alignment and a warning is printed. Slice beyond the length of
1078              the sequence does not do padding.
1079  Returns   : A Bio::SimpleAlign object
1080  Args      : Positive integer for start column, positive integer for end column,
1081              optional boolean which if true will keep gap-only columns in the newly
1082              created slice. Example:
1083
1084              $aln2 = $aln->slice(20,30,1)
1085
1086 =cut
1087
1088 sub slice {
1089         my $self = shift;
1090         my ($start, $end, $keep_gap_only) = @_;
1091
1092         $self->throw("Slice start has to be a positive integer, not [$start]")
1093           unless $start =~ /^\d+$/ and $start > 0;
1094         $self->throw("Slice end has to be a positive integer, not [$end]")
1095           unless $end =~ /^\d+$/ and $end > 0;
1096         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097           unless $start <= $end;
1098         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099                                          "[$start] is too big.") if $start > $self->length;
1100     my $cons_meta = $self->consensus_meta;
1101         my $aln = $self->new;
1102         $aln->id($self->id);
1103         foreach my $seq ( $self->each_seq() ) {
1104             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105             Bio::Seq::Meta->new
1106         (-id      => $seq->id,
1107                  -alphabet => $seq->alphabet,
1108                  -strand  => $seq->strand,
1109                  -verbose => $self->verbose) :
1110             Bio::LocatableSeq->new
1111         (-id      => $seq->id,
1112                  -alphabet => $seq->alphabet,
1113                  -strand  => $seq->strand,
1114                  -verbose => $self->verbose);
1115
1116             # seq
1117             my $seq_end = $end;
1118             $seq_end = $seq->length if( $end > $seq->length );
1119
1120             my $slice_seq = $seq->subseq($start, $seq_end);
1121             $new_seq->seq( $slice_seq );
1122
1123             $slice_seq =~ s/\W//g;
1124
1125             if ($start > 1) {
1126             my $pre_start_seq = $seq->subseq(1, $start - 1);
1127             $pre_start_seq =~ s/\W//g;
1128             if (!defined($seq->strand)) {
1129                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130             } elsif ($seq->strand < 0){
1131                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132             } else {
1133                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1134             }
1135             } else {
1136             if ((defined $seq->strand)&&($seq->strand < 0)){
1137                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138             } else {
1139                $new_seq->start( $seq->start);
1140             }
1141             }
1142         if ($new_seq->isa('Bio::Seq::MetaI')) {
1143             for my $meta_name ($seq->meta_names) {
1144                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1145             }
1146         }
1147             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148
1149             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150             $aln->add_seq($new_seq);
1151             } else {
1152             if( $keep_gap_only ) {
1153                 $aln->add_seq($new_seq);
1154             } else {
1155                 my $nse = $seq->get_nse();
1156                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157                     " Sequence excluded from the new alignment.");
1158             }
1159             }
1160         }
1161     if ($cons_meta) {
1162         my $new = Bio::Seq::Meta->new();
1163         for my $meta_name ($cons_meta->meta_names) {
1164             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165         }
1166         $aln->consensus_meta($new);
1167     }
1168     $aln->annotation($self->annotation);
1169     # fix for meta, sf, ann
1170         return $aln;
1171 }
1172
1173 =head2 remove_columns
1174
1175  Title     : remove_columns
1176  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177              $aln2 = $aln->remove_columns([0,0],[6,8])
1178  Function  : Creates an aligment with columns removed corresponding to
1179              the specified type or by specifying the columns by number.
1180  Returns   : Bio::SimpleAlign object
1181  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182              'all_gaps_columns') or array ref where the referenced array
1183              contains a pair of integers that specify a range.
1184              The first column is 0
1185
1186 =cut
1187
1188 sub remove_columns {
1189     my ($self,@args) = @_;
1190     @args || $self->throw("Must supply column ranges or column types");
1191     my $aln;
1192
1193     if ($args[0][0] =~ /^[a-z_]+$/i) {
1194         $aln = $self->_remove_columns_by_type($args[0]);
1195     } elsif ($args[0][0] =~ /^\d+$/) {
1196         $aln = $self->_remove_columns_by_num(\@args);
1197     } else {
1198         $self->throw("You must pass array references to remove_columns(), not @args");
1199     }
1200     # fix for meta, sf, ann
1201     $aln;
1202 }
1203
1204
1205 =head2 remove_gaps
1206
1207  Title     : remove_gaps
1208  Usage     : $aln2 = $aln->remove_gaps
1209  Function  : Creates an aligment with gaps removed
1210  Returns   : a Bio::SimpleAlign object
1211  Args      : a gap character(optional) if none specified taken
1212                 from $self->gap_char,
1213              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214                         indicates that only all-gaps columns should be deleted
1215
1216 Used from method L<remove_columns> in most cases. Set gap character
1217 using L<gap_char()|gap_char>.
1218
1219 =cut
1220
1221 sub remove_gaps {
1222     my ($self,$gapchar,$all_gaps_columns) = @_;
1223     my $gap_line;
1224     if ($all_gaps_columns) {
1225         $gap_line = $self->all_gap_line($gapchar);
1226     } else {
1227         $gap_line = $self->gap_line($gapchar);
1228     }
1229     my $aln = $self->new;
1230
1231     my @remove;
1232     my $length = 0;
1233     my $del_char = $gapchar || $self->gap_char;
1234     # Do the matching to get the segments to remove
1235     while ($gap_line =~ m/[$del_char]/g) {
1236         my $start = pos($gap_line)-1;
1237         $gap_line=~/\G[$del_char]+/gc;
1238         my $end = pos($gap_line)-1;
1239
1240         #have to offset the start and end for subsequent removes
1241         $start-=$length;
1242         $end  -=$length;
1243         $length += ($end-$start+1);
1244         push @remove, [$start,$end];
1245     }
1246
1247     #remove the segments
1248     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249     # fix for meta, sf, ann
1250     return $aln;
1251 }
1252
1253
1254 sub _remove_col {
1255     my ($self,$aln,$remove) = @_;
1256     my @new;
1257
1258     my $gap = $self->gap_char;
1259
1260     # splice out the segments and create new seq
1261     foreach my $seq($self->each_seq){
1262         my $new_seq = Bio::LocatableSeq->new(
1263                                              -id      => $seq->id,
1264                                              -alphabet=> $seq->alphabet,
1265                                              -strand  => $seq->strand,
1266                                              -verbose => $self->verbose);
1267         my $sequence = $seq->seq;
1268         foreach my $pair(@{$remove}){
1269             my $start = $pair->[0];
1270             my $end   = $pair->[1];
1271             $sequence = $seq->seq unless $sequence;
1272             my $orig = $sequence;
1273             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1274             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275             $sequence = $head.$tail;
1276             # start
1277             unless (defined $new_seq->start) {
1278                 if ($start == 0) {
1279                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1281                 }
1282                 else {
1283                     my $start_adjust = $orig =~ /^$gap+/;
1284                     if ($start_adjust) {
1285                         $start_adjust = $+[0] == $start;
1286                     }
1287                     $new_seq->start($seq->start + $start_adjust);
1288                 }
1289             }
1290             # end
1291             if (($end + 1) >= CORE::length($orig)) {
1292                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294             }
1295             else {
1296                 $new_seq->end($seq->end);
1297             }
1298         }
1299
1300         if ($new_seq->end < $new_seq->start) {
1301             # we removed all columns except for gaps: set to 0 to indicate no
1302             # sequence
1303             $new_seq->start(0);
1304             $new_seq->end(0);
1305         }
1306
1307         $new_seq->seq($sequence) if $sequence;
1308                 push @new, $new_seq;
1309     }
1310     # add the new seqs to the alignment
1311     foreach my $new(@new){
1312         $aln->add_seq($new);
1313     }
1314     # fix for meta, sf, ann
1315     return $aln;
1316 }
1317
1318 sub _remove_columns_by_type {
1319         my ($self,$type) = @_;
1320         my $aln = $self->new;
1321         my @remove;
1322
1323         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1324         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1325         my %matchchars = ( 'match'           => '\*',
1326                        'weak'             => '\.',
1327                        'strong'           => ':',
1328                        'mismatch'         => ' ',
1329                        'gaps'             => '',
1330                        'all_gaps_columns' => ''
1331                      );
1332         # get the characters to delete against
1333         my $del_char;
1334         foreach my $type (@{$type}){
1335                 $del_char.= $matchchars{$type};
1336         }
1337
1338         my $length = 0;
1339         my $match_line = $self->match_line;
1340         # do the matching to get the segments to remove
1341         if($del_char){
1342                 while($match_line =~ m/[$del_char]/g ){
1343                         my $start = pos($match_line)-1;
1344                         $match_line=~/\G[$del_char]+/gc;
1345                         my $end = pos($match_line)-1;
1346
1347                         #have to offset the start and end for subsequent removes
1348                         $start-=$length;
1349                         $end  -=$length;
1350                         $length += ($end-$start+1);
1351                         push @remove, [$start,$end];
1352                 }
1353         }
1354
1355         # remove the segments
1356         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357         $aln = $aln->remove_gaps() if $gap;
1358         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359     # fix for meta, sf, ann
1360         $aln;
1361 }
1362
1363
1364 sub _remove_columns_by_num {
1365         my ($self,$positions) = @_;
1366         my $aln = $self->new;
1367
1368         # sort the positions
1369         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1370
1371     my @remove;
1372     my $length = 0;
1373     foreach my $pos (@{$positions}) {
1374         my ($start, $end) = @{$pos};
1375
1376         #have to offset the start and end for subsequent removes
1377         $start-=$length;
1378         $end  -=$length;
1379         $length += ($end-$start+1);
1380         push @remove, [$start,$end];
1381     }
1382
1383     #remove the segments
1384     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385     # fix for meta, sf, ann
1386         $aln;
1387 }
1388
1389
1390 =head1 Change sequences within the MSA
1391
1392 These methods affect characters in all sequences without changing the
1393 alignment.
1394
1395 =head2 splice_by_seq_pos
1396
1397  Title   : splice_by_seq_pos
1398  Usage   : $status = splice_by_seq_pos(1);
1399  Function: splices all aligned sequences where the specified sequence
1400            has gaps.
1401  Example :
1402  Returns : 1 on success
1403  Args    : position of sequence to splice by
1404
1405
1406 =cut
1407
1408 sub splice_by_seq_pos{
1409   my ($self,$pos) = @_;
1410
1411   my $guide = $self->get_seq_by_pos($pos);
1412   my $guide_seq = $guide->seq;
1413
1414   $guide_seq =~ s/\./\-/g;
1415
1416   my @gaps = ();
1417   $pos = -1;
1418   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419     unshift @gaps, $pos;
1420     $pos++;
1421   }
1422
1423   foreach my $seq ($self->each_seq){
1424     my @bases = split '', $seq->seq;
1425
1426     splice(@bases, $_, 1) foreach @gaps;
1427     $seq->seq(join('', @bases));
1428   }
1429
1430   1;
1431 }
1432
1433 =head2 map_chars
1434
1435  Title     : map_chars
1436  Usage     : $ali->map_chars('\.','-')
1437  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438              characters
1439
1440              Notice that the from (arg1) is interpretted as a regex,
1441              so be careful about quoting meta characters (eg
1442              $ali->map_chars('.','-') wont do what you want)
1443  Returns   :
1444  Argument  : 'from' rexexp
1445              'to' string
1446
1447 =cut
1448
1449 sub map_chars {
1450     my $self = shift;
1451     my $from = shift;
1452     my $to   = shift;
1453     my ($seq,$temp);
1454
1455     $self->throw("Need exactly two arguments")
1456         unless defined $from and defined $to;
1457
1458     foreach $seq ( $self->each_seq() ) {
1459         $temp = $seq->seq();
1460         $temp =~ s/$from/$to/g;
1461         $seq->seq($temp);
1462     }
1463     return 1;
1464 }
1465
1466
1467 =head2 uppercase
1468
1469  Title     : uppercase()
1470  Usage     : $ali->uppercase()
1471  Function  : Sets all the sequences to uppercase
1472  Returns   :
1473  Argument  :
1474
1475 =cut
1476
1477 sub uppercase {
1478     my $self = shift;
1479     my $seq;
1480     my $temp;
1481
1482     foreach $seq ( $self->each_seq() ) {
1483       $temp = $seq->seq();
1484       $temp =~ tr/[a-z]/[A-Z]/;
1485
1486       $seq->seq($temp);
1487     }
1488     return 1;
1489 }
1490
1491 =head2 cigar_line
1492
1493  Title    : cigar_line()
1494  Usage    : %cigars = $align->cigar_line()
1495  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1496             Report) line for each sequence in the alignment. Examples are
1497             "1,60" or "5,10:12,58", where the numbers refer to conserved
1498             positions within the alignment. The keys of the hash are the
1499             NSEs (name/start/end) assigned to each sequence.
1500  Args     : threshold (optional, defaults to 100)
1501  Returns  : Hash of strings (cigar lines)
1502
1503 =cut
1504
1505 sub cigar_line {
1506         my $self = shift;
1507         my $thr=shift||100;
1508         my %cigars;
1509
1510         my @consensus = split "",($self->consensus_string($thr));
1511         my $len = $self->length;
1512         my $gapchar = $self->gap_char;
1513
1514         # create a precursor, something like (1,4,5,6,7,33,45),
1515         # where each number corresponds to a conserved position
1516         foreach my $seq ( $self->each_seq ) {
1517                 my @seq = split "", uc ($seq->seq);
1518                 my $pos = 1;
1519                 for (my $x = 0 ; $x < $len ; $x++ ) {
1520                         if ($seq[$x] eq $consensus[$x]) {
1521                                 push @{$cigars{$seq->get_nse}},$pos;
1522                                 $pos++;
1523                         } elsif ($seq[$x] ne $gapchar) {
1524                                 $pos++;
1525                         }
1526                 }
1527         }
1528         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1529         for my $name (keys %cigars) {
1530                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1531                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1532       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1533            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1534                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1535                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1536                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1537                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1538                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1539                         }
1540       }
1541         }
1542   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1543   for my $name (keys %cigars) {
1544           my @remove;
1545           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1546                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1547                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1548                       unshift @remove,$x;
1549               }
1550            }
1551       for my $pos (@remove) {
1552                         splice @{$cigars{$name}}, $pos, 1;
1553            }
1554    }
1555    # join and punctuate
1556    for my $name (keys %cigars) {
1557           my ($start,$end,$str) = "";
1558           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1559                   $str .= ($start . "," . $end . ":");
1560           }
1561           $str =~ s/:$//;
1562       $cigars{$name} = $str;
1563    }
1564    %cigars;
1565 }
1566
1567
1568 =head2 match_line
1569
1570  Title    : match_line()
1571  Usage    : $line = $align->match_line()
1572  Function : Generates a match line - much like consensus string
1573             except that a line indicating the '*' for a match.
1574  Args     : (optional) Match line characters ('*' by default)
1575             (optional) Strong match char (':' by default)
1576             (optional) Weak match char ('.' by default)
1577  Returns  : String
1578
1579 =cut
1580
1581 sub match_line {
1582         my ($self,$matchlinechar, $strong, $weak) = @_;
1583         my %matchchars = ('match'    => $matchlinechar || '*',
1584                                                           'weak'     => $weak          || '.',
1585                                                           'strong'   => $strong        || ':',
1586                                                           'mismatch' => ' ',
1587                                                   );
1588
1589         my @seqchars;
1590         my $alphabet;
1591         foreach my $seq ( $self->each_seq ) {
1592                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1593                 $alphabet = $seq->alphabet unless defined $alphabet;
1594         }
1595         my $refseq = shift @seqchars;
1596         # let's just march down the columns
1597         my $matchline;
1598  POS:
1599         foreach my $pos ( 0..$self->length ) {
1600                 my $refchar = $refseq->[$pos];
1601                 my $char = $matchchars{'mismatch'};
1602                 unless( defined $refchar ) {
1603                         last if $pos == $self->length; # short circuit on last residue
1604                         # this in place to handle jason's soon-to-be-committed
1605                         # intron mapping code
1606                         goto bottom;
1607                 }
1608                 my %col = ($refchar => 1);
1609                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1610                 foreach my $seq ( @seqchars ) {
1611                         next if $pos >= scalar @$seq;
1612                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1613                                                           $seq->[$pos] eq ' ' );
1614                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1615                 }
1616                 my @colresidues = sort keys %col;
1617
1618                 # if all the values are the same
1619                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1620                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1621                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1622                         # matches for protein seqs
1623             TYPE:
1624                         foreach my $type ( qw(strong weak) ) {
1625                                 # iterate through categories
1626                                 my %groups;
1627                                 # iterate through each of the aa in the col
1628                                 # look to see which groups it is in
1629                                 foreach my $c ( @colresidues ) {
1630                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1631                                                 push @{$groups{$f}},$c;
1632                                         }
1633                                 }
1634                          GRP:
1635                                 foreach my $cols ( values %groups ) {
1636                                         @$cols = sort @$cols;
1637                                         # now we are just testing to see if two arrays
1638                                         # are identical w/o changing either one
1639                                         # have to be same len
1640                                         next if( scalar @$cols != scalar @colresidues );
1641                                         # walk down the length and check each slot
1642                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1643                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1644                                         }
1645                                         $char = $matchchars{$type};
1646                                         last TYPE;
1647                                 }
1648                         }
1649                 }
1650          bottom:
1651                 $matchline .= $char;
1652         }
1653         return $matchline;
1654 }
1655
1656
1657 =head2 gap_line
1658
1659  Title    : gap_line()
1660  Usage    : $line = $align->gap_line()
1661  Function : Generates a gap line - much like consensus string
1662             except that a line where '-' represents gap
1663  Args     : (optional) gap line characters ('-' by default)
1664  Returns  : string
1665
1666 =cut
1667
1668 sub gap_line {
1669     my ($self,$gapchar) = @_;
1670     $gapchar = $gapchar || $self->gap_char;
1671     my %gap_hsh; # column gaps vector
1672     foreach my $seq ( $self->each_seq ) {
1673                 my $i = 0;
1674         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1675                   map {[$i++, $_]} split(//, uc ($seq->seq));
1676     }
1677     my $gap_line;
1678     foreach my $pos ( 0..$self->length-1 ) {
1679           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1680     }
1681     return $gap_line;
1682 }
1683
1684 =head2 all_gap_line
1685
1686  Title    : all_gap_line()
1687  Usage    : $line = $align->all_gap_line()
1688  Function : Generates a gap line - much like consensus string
1689             except that a line where '-' represents all-gap column
1690  Args     : (optional) gap line characters ('-' by default)
1691  Returns  : string
1692
1693 =cut
1694
1695 sub all_gap_line {
1696     my ($self,$gapchar) = @_;
1697     $gapchar = $gapchar || $self->gap_char;
1698     my %gap_hsh;                # column gaps counter hash
1699     my @seqs = $self->each_seq;
1700     foreach my $seq ( @seqs ) {
1701         my $i = 0;
1702         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1703         map {[$i++, $_]} split(//, uc ($seq->seq));
1704     }
1705     my $gap_line;
1706     foreach my $pos ( 0..$self->length-1 ) {
1707         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1708             # gaps column
1709             $gap_line .= $gapchar;
1710         } else {
1711             $gap_line .= '.';
1712         }
1713     }
1714     return $gap_line;
1715 }
1716
1717 =head2 gap_col_matrix
1718
1719  Title    : gap_col_matrix()
1720  Usage    : my $cols = $align->gap_col_matrix()
1721  Function : Generates an array of hashes where
1722             each entry in the array is a hash reference
1723             with keys of all the sequence names and
1724             and value of 1 or 0 if the sequence has a gap at that column
1725  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1726
1727 =cut
1728
1729 sub gap_col_matrix {
1730     my ($self,$gapchar) = @_;
1731     $gapchar = $gapchar || $self->gap_char;
1732     my %gap_hsh; # column gaps vector
1733     my @cols;
1734     foreach my $seq ( $self->each_seq ) {
1735         my $i = 0;
1736         my $str = $seq->seq;
1737         my $len = $seq->length;
1738         my $ch;
1739         my $id = $seq->display_id;
1740         while( $i < $len ) {
1741             $ch = substr($str, $i, 1);
1742             $cols[$i++]->{$id} = ($ch eq $gapchar);
1743         }
1744     }
1745     return \@cols;
1746 }
1747
1748 =head2 match
1749
1750  Title     : match()
1751  Usage     : $ali->match()
1752  Function  : Goes through all columns and changes residues that are
1753              identical to residue in first sequence to match '.'
1754              character. Sets match_char.
1755
1756              USE WITH CARE: Most MSA formats do not support match
1757              characters in sequences, so this is mostly for output
1758              only. NEXUS format (Bio::AlignIO::nexus) can handle
1759              it.
1760  Returns   : 1
1761  Argument  : a match character, optional, defaults to '.'
1762
1763 =cut
1764
1765 sub match {
1766     my ($self, $match) = @_;
1767
1768     $match ||= '.';
1769     my ($matching_char) = $match;
1770     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1771     $self->map_chars($matching_char, '-');
1772
1773     my @seqs = $self->each_seq();
1774     return 1 unless scalar @seqs > 1;
1775
1776     my $refseq = shift @seqs ;
1777     my @refseq = split //, $refseq->seq;
1778     my $gapchar = $self->gap_char;
1779
1780     foreach my $seq ( @seqs ) {
1781         my @varseq = split //, $seq->seq();
1782         for ( my $i=0; $i < scalar @varseq; $i++) {
1783             $varseq[$i] = $match if defined $refseq[$i] &&
1784                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1785                   $refseq[$i] =~ /$gapchar/ )
1786                       && $refseq[$i] eq $varseq[$i];
1787         }
1788         $seq->seq(join '', @varseq);
1789     }
1790     $self->match_char($match);
1791     return 1;
1792 }
1793
1794
1795 =head2 unmatch
1796
1797  Title     : unmatch()
1798  Usage     : $ali->unmatch()
1799  Function  : Undoes the effect of method match. Unsets match_char.
1800  Returns   : 1
1801  Argument  : a match character, optional, defaults to '.'
1802
1803 See L<match> and L<match_char>
1804
1805 =cut
1806
1807 sub unmatch {
1808     my ($self, $match) = @_;
1809
1810     $match ||= '.';
1811
1812     my @seqs = $self->each_seq();
1813     return 1 unless scalar @seqs > 1;
1814
1815     my $refseq = shift @seqs ;
1816     my @refseq = split //, $refseq->seq;
1817     my $gapchar = $self->gap_char;
1818     foreach my $seq ( @seqs ) {
1819         my @varseq = split //, $seq->seq();
1820         for ( my $i=0; $i < scalar @varseq; $i++) {
1821             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1822                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1823                   $refseq[$i] =~ /$gapchar/ ) &&
1824                       $varseq[$i] eq $match;
1825         }
1826         $seq->seq(join '', @varseq);
1827     }
1828     $self->match_char('');
1829     return 1;
1830 }
1831
1832 =head1 MSA attributes
1833
1834 Methods for setting and reading the MSA attributes.
1835
1836 Note that the methods defining character semantics depend on the user
1837 to set them sensibly.  They are needed only by certain input/output
1838 methods. Unset them by setting to an empty string ('').
1839
1840 =head2 id
1841
1842  Title     : id
1843  Usage     : $myalign->id("Ig")
1844  Function  : Gets/sets the id field of the alignment
1845  Returns   : An id string
1846  Argument  : An id string (optional)
1847
1848 =cut
1849
1850 sub id {
1851     my ($self, $name) = @_;
1852
1853     if (defined( $name )) {
1854         $self->{'_id'} = $name;
1855     }
1856
1857     return $self->{'_id'};
1858 }
1859
1860 =head2 accession
1861
1862  Title     : accession
1863  Usage     : $myalign->accession("PF00244")
1864  Function  : Gets/sets the accession field of the alignment
1865  Returns   : An acc string
1866  Argument  : An acc string (optional)
1867
1868 =cut
1869
1870 sub accession {
1871     my ($self, $acc) = @_;
1872
1873     if (defined( $acc )) {
1874         $self->{'_accession'} = $acc;
1875     }
1876
1877     return $self->{'_accession'};
1878 }
1879
1880 =head2 description
1881
1882  Title     : description
1883  Usage     : $myalign->description("14-3-3 proteins")
1884  Function  : Gets/sets the description field of the alignment
1885  Returns   : An description string
1886  Argument  : An description string (optional)
1887
1888 =cut
1889
1890 sub description {
1891     my ($self, $name) = @_;
1892
1893     if (defined( $name )) {
1894         $self->{'_description'} = $name;
1895     }
1896
1897     return $self->{'_description'};
1898 }
1899
1900 =head2 missing_char
1901
1902  Title     : missing_char
1903  Usage     : $myalign->missing_char("?")
1904  Function  : Gets/sets the missing_char attribute of the alignment
1905              It is generally recommended to set it to 'n' or 'N'
1906              for nucleotides and to 'X' for protein.
1907  Returns   : An missing_char string,
1908  Argument  : An missing_char string (optional)
1909
1910 =cut
1911
1912 sub missing_char {
1913     my ($self, $char) = @_;
1914
1915     if (defined $char ) {
1916         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1917         $self->{'_missing_char'} = $char;
1918     }
1919
1920     return $self->{'_missing_char'};
1921 }
1922
1923 =head2 match_char
1924
1925  Title     : match_char
1926  Usage     : $myalign->match_char('.')
1927  Function  : Gets/sets the match_char attribute of the alignment
1928  Returns   : An match_char string,
1929  Argument  : An match_char string (optional)
1930
1931 =cut
1932
1933 sub match_char {
1934     my ($self, $char) = @_;
1935
1936     if (defined $char ) {
1937         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1938         $self->{'_match_char'} = $char;
1939     }
1940
1941     return $self->{'_match_char'};
1942 }
1943
1944 =head2 gap_char
1945
1946  Title     : gap_char
1947  Usage     : $myalign->gap_char('-')
1948  Function  : Gets/sets the gap_char attribute of the alignment
1949  Returns   : An gap_char string, defaults to '-'
1950  Argument  : An gap_char string (optional)
1951
1952 =cut
1953
1954 sub gap_char {
1955     my ($self, $char) = @_;
1956
1957     if (defined $char || ! defined $self->{'_gap_char'} ) {
1958         $char= '-' unless defined $char;
1959         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1960         $self->{'_gap_char'} = $char;
1961     }
1962     return $self->{'_gap_char'};
1963 }
1964
1965 =head2 symbol_chars
1966
1967  Title   : symbol_chars
1968  Usage   : my @symbolchars = $aln->symbol_chars;
1969  Function: Returns all the seen symbols (other than gaps)
1970  Returns : array of characters that are the seen symbols
1971  Args    : boolean to include the gap/missing/match characters
1972
1973 =cut
1974
1975 sub symbol_chars{
1976    my ($self,$includeextra) = @_;
1977
1978    unless ($self->{'_symbols'}) {
1979        foreach my $seq ($self->each_seq) {
1980            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1981        }
1982    }
1983    my %copy = %{$self->{'_symbols'}};
1984    if( ! $includeextra ) {
1985        foreach my $char ( $self->gap_char, $self->match_char,
1986                           $self->missing_char) {
1987            delete $copy{$char} if( defined $char );
1988        }
1989    }
1990    return keys %copy;
1991 }
1992
1993 =head1 Alignment descriptors
1994
1995 These read only methods describe the MSA in various ways.
1996
1997
1998 =head2 score
1999
2000  Title     : score
2001  Usage     : $str = $ali->score()
2002  Function  : get/set a score of the alignment
2003  Returns   : a score for the alignment
2004  Argument  : an optional score to set
2005
2006 =cut
2007
2008 sub score {
2009   my $self = shift;
2010   $self->{score} = shift if @_;
2011   return $self->{score};
2012 }
2013
2014 =head2 consensus_string
2015
2016  Title     : consensus_string
2017  Usage     : $str = $ali->consensus_string($threshold_percent)
2018  Function  : Makes a strict consensus
2019  Returns   : Consensus string
2020  Argument  : Optional treshold ranging from 0 to 100.
2021              The consensus residue has to appear at least threshold %
2022              of the sequences at a given location, otherwise a '?'
2023              character will be placed at that location.
2024              (Default value = 0%)
2025
2026 =cut
2027
2028 sub consensus_string {
2029     my $self = shift;
2030     my $threshold = shift;
2031
2032     my $out = "";
2033     my $len = $self->length - 1;
2034
2035     foreach ( 0 .. $len ) {
2036         $out .= $self->_consensus_aa($_,$threshold);
2037     }
2038     return $out;
2039 }
2040
2041 sub _consensus_aa {
2042     my $self = shift;
2043     my $point = shift;
2044     my $threshold_percent = shift || -1 ;
2045     my ($seq,%hash,$count,$letter,$key);
2046     my $gapchar = $self->gap_char;
2047     foreach $seq ( $self->each_seq() ) {
2048         $letter = substr($seq->seq,$point,1);
2049         $self->throw("--$point-----------") if $letter eq '';
2050         ($letter eq $gapchar || $letter =~ /\./) && next;
2051         # print "Looking at $letter\n";
2052         $hash{$letter}++;
2053     }
2054     my $number_of_sequences = $self->num_sequences();
2055     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2056     $count = -1;
2057     $letter = '?';
2058
2059     foreach $key ( sort keys %hash ) {
2060         # print "Now at $key $hash{$key}\n";
2061         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2062             $letter = $key;
2063             $count = $hash{$key};
2064         }
2065     }
2066     return $letter;
2067 }
2068
2069
2070 =head2 consensus_iupac
2071
2072  Title     : consensus_iupac
2073  Usage     : $str = $ali->consensus_iupac()
2074  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2075              and RNA. The output is in upper case except when gaps in
2076              a column force output to be in lower case.
2077
2078              Note that if your alignment sequences contain a lot of
2079              IUPAC ambiquity codes you often have to manually set
2080              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2081              indicate a protein sequence.
2082  Returns   : consensus string
2083  Argument  : none
2084  Throws    : on protein sequences
2085
2086 =cut
2087
2088 sub consensus_iupac {
2089     my $self = shift;
2090     my $out = "";
2091     my $len = $self->length-1;
2092
2093     # only DNA and RNA sequences are valid
2094     foreach my $seq ( $self->each_seq() ) {
2095         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2096             if $seq->alphabet eq 'protein';
2097     }
2098     # loop over the alignment columns
2099     foreach my $count ( 0 .. $len ) {
2100         $out .= $self->_consensus_iupac($count);
2101     }
2102     return $out;
2103 }
2104
2105 sub _consensus_iupac {
2106     my ($self, $column) = @_;
2107     my ($string, $char, $rna);
2108
2109     #determine all residues in a column
2110     foreach my $seq ( $self->each_seq() ) {
2111         $string .= substr($seq->seq, $column, 1);
2112     }
2113     $string = uc $string;
2114
2115     # quick exit if there's an N in the string
2116     if ($string =~ /N/) {
2117         $string =~ /\W/ ? return 'n' : return 'N';
2118     }
2119     # ... or if there are only gap characters
2120     return '-' if $string =~ /^\W+$/;
2121
2122     # treat RNA as DNA in regexps
2123     if ($string =~ /U/) {
2124         $string =~ s/U/T/;
2125         $rna = 1;
2126     }
2127
2128     # the following s///'s only need to be done to the _first_ ambiguity code
2129     # as we only need to see the _range_ of characters in $string
2130
2131     if ($string =~ /[VDHB]/) {
2132         $string =~ s/V/AGC/;
2133         $string =~ s/D/AGT/;
2134         $string =~ s/H/ACT/;
2135         $string =~ s/B/CTG/;
2136     }
2137
2138     if ($string =~ /[SKYRWM]/) {
2139         $string =~ s/S/GC/;
2140         $string =~ s/K/GT/;
2141         $string =~ s/Y/CT/;
2142         $string =~ s/R/AG/;
2143         $string =~ s/W/AT/;
2144         $string =~ s/M/AC/;
2145     }
2146
2147     # and now the guts of the thing
2148
2149     if ($string =~ /A/) {
2150         $char = 'A';                     # A                      A
2151         if ($string =~ /G/) {
2152             $char = 'R';                 # A and G (purines)      R
2153             if ($string =~ /C/) {
2154                 $char = 'V';             # A and G and C          V
2155                 if ($string =~ /T/) {
2156                     $char = 'N';         # A and G and C and T    N
2157                 }
2158             } elsif ($string =~ /T/) {
2159                 $char = 'D';             # A and G and T          D
2160             }
2161         } elsif ($string =~ /C/) {
2162             $char = 'M';                 # A and C                M
2163             if ($string =~ /T/) {
2164                 $char = 'H';             # A and C and T          H
2165             }
2166         } elsif ($string =~ /T/) {
2167             $char = 'W';                 # A and T                W
2168         }
2169     } elsif ($string =~ /C/) {
2170         $char = 'C';                     # C                      C
2171         if ($string =~ /T/) {
2172             $char = 'Y';                 # C and T (pyrimidines)  Y
2173             if ($string =~ /G/) {
2174                 $char = 'B';             # C and T and G          B
2175             }
2176         } elsif ($string =~ /G/) {
2177             $char = 'S';                 # C and G                S
2178         }
2179     } elsif ($string =~ /G/) {
2180         $char = 'G';                     # G                      G
2181         if ($string =~ /C/) {
2182             $char = 'S';                 # G and C                S
2183         } elsif ($string =~ /T/) {
2184             $char = 'K';                 # G and T                K
2185         }
2186     } elsif ($string =~ /T/) {
2187         $char = 'T';                     # T                      T
2188     }
2189
2190     $char = 'U' if $rna and $char eq 'T';
2191     $char = lc $char if $string =~ /\W/;
2192
2193     return $char;
2194 }
2195
2196
2197 =head2 consensus_meta
2198
2199  Title     : consensus_meta
2200  Usage     : $seqmeta = $ali->consensus_meta()
2201  Function  : Returns a Bio::Seq::Meta object containing the consensus
2202              strings derived from meta data analysis.
2203  Returns   : Bio::Seq::Meta
2204  Argument  : Bio::Seq::Meta
2205  Throws    : non-MetaI object
2206
2207 =cut
2208
2209 sub consensus_meta {
2210     my ($self, $meta) = @_;
2211     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2212         $self->throw('Not a Bio::Seq::MetaI object');
2213     }
2214     return $self->{'_aln_meta'} = $meta if $meta;
2215     return $self->{'_aln_meta'}
2216 }
2217
2218 =head2 is_flush
2219
2220  Title     : is_flush
2221  Usage     : if ( $ali->is_flush() )
2222  Function  : Tells you whether the alignment
2223            : is flush, i.e. all of the same length
2224  Returns   : 1 or 0
2225  Argument  :
2226
2227 =cut
2228
2229 sub is_flush {
2230     my ($self,$report) = @_;
2231     my $seq;
2232     my $length = (-1);
2233     my $temp;
2234
2235     foreach $seq ( $self->each_seq() ) {
2236         if( $length == (-1) ) {
2237             $length = CORE::length($seq->seq());
2238             next;
2239         }
2240
2241         $temp = CORE::length($seq->seq());
2242         if( $temp != $length ) {
2243             $self->warn("expecting $length not $temp from ".
2244                         $seq->display_id) if( $report );
2245             $self->debug("expecting $length not $temp from ".
2246                          $seq->display_id);
2247             $self->debug($seq->seq(). "\n");
2248             return 0;
2249         }
2250     }
2251
2252     return 1;
2253 }
2254
2255
2256 =head2 length
2257
2258  Title     : length()
2259  Usage     : $len = $ali->length()
2260  Function  : Returns the maximum length of the alignment.
2261              To be sure the alignment is a block, use is_flush
2262  Returns   : Integer
2263  Argument  :
2264
2265 =cut
2266
2267 sub length_aln {
2268     my $self = shift;
2269     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2270     $self->length(@_);
2271 }
2272
2273 sub length {
2274     my $self = shift;
2275     my $seq;
2276     my $length = -1;
2277     my $temp;
2278
2279     foreach $seq ( $self->each_seq() ) {
2280         $temp = $seq->length();
2281         if( $temp > $length ) {
2282             $length = $temp;
2283         }
2284     }
2285
2286     return $length;
2287 }
2288
2289
2290 =head2 maxdisplayname_length
2291
2292  Title     : maxdisplayname_length
2293  Usage     : $ali->maxdisplayname_length()
2294  Function  : Gets the maximum length of the displayname in the
2295              alignment. Used in writing out various MSA formats.
2296  Returns   : integer
2297  Argument  :
2298
2299 =cut
2300
2301 sub maxname_length {
2302     my $self = shift;
2303     $self->deprecated("maxname_length - deprecated method.".
2304                       " Use maxdisplayname_length() instead.");
2305     $self->maxdisplayname_length();
2306 }
2307
2308 sub maxnse_length {
2309     my $self = shift;
2310     $self->deprecated("maxnse_length - deprecated method.".
2311                       " Use maxnse_length() instead.");
2312     $self->maxdisplayname_length();
2313 }
2314
2315 sub maxdisplayname_length {
2316     my $self = shift;
2317     my $maxname = (-1);
2318     my ($seq,$len);
2319
2320     foreach $seq ( $self->each_seq() ) {
2321         $len = CORE::length $self->displayname($seq->get_nse());
2322
2323         if( $len > $maxname ) {
2324             $maxname = $len;
2325         }
2326     }
2327
2328     return $maxname;
2329 }
2330
2331 =head2 max_metaname_length
2332
2333  Title     : max_metaname_length
2334  Usage     : $ali->max_metaname_length()
2335  Function  : Gets the maximum length of the meta name tags in the
2336              alignment for the sequences and for the alignment.
2337              Used in writing out various MSA formats.
2338  Returns   : integer
2339  Argument  : None
2340
2341 =cut
2342
2343 sub max_metaname_length {
2344     my $self = shift;
2345     my $maxname = (-1);
2346     my ($seq,$len);
2347
2348     # check seq meta first
2349     for $seq ( $self->each_seq() ) {
2350         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2351         for my $mtag ($seq->meta_names) {
2352             $len = CORE::length $mtag;
2353             if( $len > $maxname ) {
2354                 $maxname = $len;
2355             }
2356         }
2357     }
2358
2359     # alignment meta
2360     for my $meta ($self->consensus_meta) {
2361         next unless $meta;
2362         for my $name ($meta->meta_names) {
2363             $len = CORE::length $name;
2364             if( $len > $maxname ) {
2365                 $maxname = $len;
2366             }
2367         }
2368     }
2369
2370     return $maxname;
2371 }
2372
2373 =head2 num_residues
2374
2375  Title     : num_residues
2376  Usage     : $no = $ali->num_residues
2377  Function  : number of residues in total in the alignment
2378  Returns   : integer
2379  Argument  :
2380  Note      : replaces no_residues()
2381
2382 =cut
2383
2384 sub num_residues {
2385     my $self = shift;
2386     my $count = 0;
2387
2388     foreach my $seq ($self->each_seq) {
2389         my $str = $seq->seq();
2390
2391         $count += ($str =~ s/[A-Za-z]//g);
2392     }
2393
2394     return $count;
2395 }
2396
2397 =head2 num_sequences
2398
2399  Title     : num_sequences
2400  Usage     : $depth = $ali->num_sequences
2401  Function  : number of sequence in the sequence alignment
2402  Returns   : integer
2403  Argument  : none
2404  Note      : replaces no_sequences()
2405
2406 =cut
2407
2408 sub num_sequences {
2409     my $self = shift;
2410     return scalar($self->each_seq);
2411 }
2412
2413
2414 =head2 average_percentage_identity
2415
2416  Title   : average_percentage_identity
2417  Usage   : $id = $align->average_percentage_identity
2418  Function: The function uses a fast method to calculate the average
2419            percentage identity of the alignment
2420  Returns : The average percentage identity of the alignment
2421  Args    : None
2422  Notes   : This method implemented by Kevin Howe calculates a figure that is
2423            designed to be similar to the average pairwise identity of the
2424            alignment (identical in the absence of gaps), without having to
2425            explicitly calculate pairwise identities proposed by Richard Durbin.
2426            Validated by Ewan Birney ad Alex Bateman.
2427
2428 =cut
2429
2430 sub average_percentage_identity{
2431    my ($self,@args) = @_;
2432
2433    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2434                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2435
2436    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2437
2438    if (! $self->is_flush()) {
2439        $self->throw("All sequences in the alignment must be the same length");
2440    }
2441
2442    @seqs = $self->each_seq();
2443    $len = $self->length();
2444
2445    # load the each hash with correct keys for existence checks
2446
2447    for( my $index=0; $index < $len; $index++) {
2448        foreach my $letter (@alphabet) {
2449            $countHashes[$index]->{$letter} = 0;
2450        }
2451    }
2452    foreach my $seq (@seqs)  {
2453        my @seqChars = split //, $seq->seq();
2454        for( my $column=0; $column < @seqChars; $column++ ) {
2455            my $char = uc($seqChars[$column]);
2456            if (exists $countHashes[$column]->{$char}) {
2457                $countHashes[$column]->{$char}++;
2458            }
2459        }
2460    }
2461
2462    $total = 0;
2463    $divisor = 0;
2464    for(my $column =0; $column < $len; $column++) {
2465        my %hash = %{$countHashes[$column]};
2466        $subdivisor = 0;
2467        foreach my $res (keys %hash) {
2468            $total += $hash{$res}*($hash{$res} - 1);
2469            $subdivisor += $hash{$res};
2470        }
2471        $divisor += $subdivisor * ($subdivisor - 1);
2472    }
2473    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2474 }
2475
2476 =head2 percentage_identity
2477
2478  Title   : percentage_identity
2479  Usage   : $id = $align->percentage_identity
2480  Function: The function calculates the average percentage identity
2481            (aliased to average_percentage_identity)
2482  Returns : The average percentage identity
2483  Args    : None
2484
2485 =cut
2486
2487 sub percentage_identity {
2488     my $self = shift;
2489     return $self->average_percentage_identity();
2490 }
2491
2492 =head2 overall_percentage_identity
2493
2494  Title   : overall_percentage_identity
2495  Usage   : $id = $align->overall_percentage_identity
2496            $id = $align->overall_percentage_identity('short')
2497  Function: The function calculates the percentage identity of
2498            the conserved columns
2499  Returns : The percentage identity of the conserved columns
2500  Args    : length value to use, optional defaults to alignment length
2501                  possible values: 'align', 'short', 'long'
2502
2503 The argument values 'short' and 'long' refer to shortest and longest
2504 sequence in the alignment. Method modification code by Hongyu Zhang.
2505
2506 =cut
2507
2508 sub overall_percentage_identity{
2509    my ($self, $length_measure) = @_;
2510
2511    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2512
2513    my %enum = map {$_ => undef} qw (align short long);
2514
2515    $self->throw("Unknown argument [$length_measure]")
2516        if $length_measure and not exists $enum{$length_measure};
2517    $length_measure ||= 'align';
2518
2519    if (! $self->is_flush()) {
2520        $self->throw("All sequences in the alignment must be the same length");
2521    }
2522
2523    # Count the residues seen at each position
2524    my $len;
2525    my $total = 0; # number of positions with identical residues
2526    my @countHashes;
2527    my @seqs = $self->each_seq;
2528    my $nof_seqs = scalar @seqs;
2529    my $aln_len = $self->length();
2530    for my $seq (@seqs)  {
2531        my $seqstr = $seq->seq;
2532
2533        # Count residues for given sequence
2534        for my $column (0 .. $aln_len-1) {
2535            my $char = uc( substr($seqstr, $column, 1) );
2536            if ( exists $alphabet{$char} ) {
2537
2538                # This is a valid char
2539                if ( defined $countHashes[$column]->{$char} ) {
2540                  $countHashes[$column]->{$char}++;
2541                } else {
2542                  $countHashes[$column]->{$char} = 1;
2543                }
2544
2545                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2546                    # All sequences have this same residue
2547                    $total++;
2548                }
2549
2550            }
2551        }
2552
2553        # Sequence length
2554        if ($length_measure eq 'short' || $length_measure eq 'long') {
2555            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2556            if ($length_measure eq 'short') {
2557                if ( (not defined $len) || ($seq_len < $len) ) {
2558                    $len = $seq_len;
2559                }
2560            } elsif ($length_measure eq 'long') {
2561                if ( (not defined $len) || ($seq_len > $len) ) {
2562                    $len = $seq_len;
2563                }
2564            }
2565        }
2566
2567    }
2568
2569    if ($length_measure eq 'align') {
2570        $len = $aln_len;
2571    }
2572
2573    return ($total / $len ) * 100.0;
2574 }
2575
2576
2577
2578 =head1 Alignment positions
2579
2580 Methods to map a sequence position into an alignment column and back.
2581 column_from_residue_number() does the former. The latter is really a
2582 property of the sequence object and can done using
2583 L<Bio::LocatableSeq::location_from_column>:
2584
2585     # select somehow a sequence from the alignment, e.g.
2586     my $seq = $aln->get_seq_by_pos(1);
2587     #$loc is undef or Bio::LocationI object
2588     my $loc = $seq->location_from_column(5);
2589
2590 =head2 column_from_residue_number
2591
2592  Title   : column_from_residue_number
2593  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2594  Function: This function gives the position in the alignment
2595            (i.e. column number) of the given residue number in the
2596            sequence with the given name. For example, for the
2597            alignment
2598
2599              Seq1/91-97 AC..DEF.GH.
2600              Seq2/24-30 ACGG.RTY...
2601                 Seq3/43-51 AC.DDEF.GHI
2602
2603            column_from_residue_number( "Seq1", 94 ) returns 6.
2604            column_from_residue_number( "Seq2", 25 ) returns 2.
2605            column_from_residue_number( "Seq3", 50 ) returns 10.
2606
2607            An exception is thrown if the residue number would lie
2608            outside the length of the aligment
2609            (e.g. column_from_residue_number( "Seq2", 22 )
2610
2611           Note: If the the parent sequence is represented by more than
2612                 one alignment sequence and the residue number is present in
2613                 them, this method finds only the first one.
2614
2615  Returns : A column number for the position in the alignment of the
2616            given residue in the given sequence (1 = first column)
2617  Args    : A sequence id/name (not a name/start-end)
2618            A residue number in the whole sequence (not just that
2619            segment of it in the alignment)
2620
2621 =cut
2622
2623 sub column_from_residue_number {
2624     my ($self, $name, $resnumber) = @_;
2625
2626     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2627     $self->throw("Second argument residue number missing") unless $resnumber;
2628
2629     foreach my $seq ($self->each_seq_with_id($name)) {
2630         my $col;
2631         eval {
2632             $col = $seq->column_from_residue_number($resnumber);
2633         };
2634         next if $@;
2635         return $col;
2636     }
2637
2638     $self->throw("Could not find a sequence segment in $name ".
2639                  "containing residue number $resnumber");
2640
2641 }
2642
2643 =head1 Sequence names
2644
2645 Methods to manipulate the display name. The default name based on the
2646 sequence id and subsequence positions can be overridden in various
2647 ways.
2648
2649 =head2 displayname
2650
2651  Title     : displayname
2652  Usage     : $myalign->displayname("Ig", "IgA")
2653  Function  : Gets/sets the display name of a sequence in the alignment
2654  Returns   : A display name string
2655  Argument  : name of the sequence
2656              displayname of the sequence (optional)
2657
2658 =cut
2659
2660 sub get_displayname {
2661     my $self = shift;
2662     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2663     $self->displayname(@_);
2664 }
2665
2666 sub set_displayname {
2667     my $self = shift;
2668     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2669     $self->displayname(@_);
2670 }
2671
2672 sub displayname {
2673     my ($self, $name, $disname) = @_;
2674
2675     $self->throw("No sequence with name [$name]")
2676         unless defined $self->{'_seq'}->{$name};
2677
2678     if(  $disname and  $name) {
2679         $self->{'_dis_name'}->{$name} = $disname;
2680         return $disname;
2681     }
2682     elsif( defined $self->{'_dis_name'}->{$name} ) {
2683         return  $self->{'_dis_name'}->{$name};
2684     } else {
2685         return $name;
2686     }
2687 }
2688
2689 =head2 set_displayname_count
2690
2691  Title     : set_displayname_count
2692  Usage     : $ali->set_displayname_count
2693  Function  : Sets the names to be name_# where # is the number of
2694              times this name has been used.
2695  Returns   : 1, on success
2696  Argument  :
2697
2698 =cut
2699
2700 sub set_displayname_count {
2701     my $self= shift;
2702     my (@arr,$name,$seq,$count,$temp,$nse);
2703
2704     foreach $seq ( $self->each_alphabetically() ) {
2705         $nse = $seq->get_nse();
2706
2707         #name will be set when this is the second
2708         #time (or greater) is has been seen
2709
2710         if( defined $name and $name eq ($seq->id()) ) {
2711             $temp = sprintf("%s_%s",$name,$count);
2712             $self->displayname($nse,$temp);
2713             $count++;
2714         } else {
2715             $count = 1;
2716             $name = $seq->id();
2717             $temp = sprintf("%s_%s",$name,$count);
2718             $self->displayname($nse,$temp);
2719             $count++;
2720         }
2721     }
2722     return 1;
2723 }
2724
2725 =head2 set_displayname_flat
2726
2727  Title     : set_displayname_flat
2728  Usage     : $ali->set_displayname_flat()
2729  Function  : Makes all the sequences be displayed as just their name,
2730              not name/start-end
2731  Returns   : 1
2732  Argument  :
2733
2734 =cut
2735
2736 sub set_displayname_flat {
2737     my $self = shift;
2738     my ($nse,$seq);
2739
2740     foreach $seq ( $self->each_seq() ) {
2741         $nse = $seq->get_nse();
2742         $self->displayname($nse,$seq->id());
2743     }
2744     return 1;
2745 }
2746
2747 =head2 set_displayname_normal
2748
2749  Title     : set_displayname_normal
2750  Usage     : $ali->set_displayname_normal()
2751  Function  : Makes all the sequences be displayed as name/start-end
2752  Returns   : 1, on success
2753  Argument  :
2754
2755 =cut
2756
2757 sub set_displayname_normal {
2758     my $self = shift;
2759     my ($nse,$seq);
2760
2761     foreach $seq ( $self->each_seq() ) {
2762         $nse = $seq->get_nse();
2763         $self->displayname($nse,$nse);
2764     }
2765     return 1;
2766 }
2767
2768 =head2 source
2769
2770  Title   : source
2771  Usage   : $obj->source($newval)
2772  Function: sets the Alignment source program
2773  Example :
2774  Returns : value of source
2775  Args    : newvalue (optional)
2776
2777
2778 =cut
2779
2780 sub source{
2781    my ($self,$value) = @_;
2782    if( defined $value) {
2783       $self->{'_source'} = $value;
2784     }
2785     return $self->{'_source'};
2786 }
2787
2788 =head2 set_displayname_safe
2789
2790  Title     : set_displayname_safe
2791  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2792  Function  : Assign machine-generated serial names to sequences in input order.
2793              Designed to protect names during PHYLIP runs. Assign 10-char string
2794              in the form of "S000000001" to "S999999999". Restore the original
2795              names using "restore_displayname".
2796  Returns   : 1. a new $aln with system names;
2797              2. a hash ref for restoring names
2798  Argument  : Number for id length (default 10)
2799
2800 =cut
2801
2802 sub set_displayname_safe {
2803     my $self = shift;
2804     my $idlength = shift || 10;
2805     my ($seq, %phylip_name);
2806     my $ct=0;
2807     my $new=Bio::SimpleAlign->new();
2808     foreach $seq ( $self->each_seq() ) {
2809         $ct++;
2810         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2811         $phylip_name{$pname}=$seq->id();
2812         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2813                                             -seq      => $seq->seq(),
2814                                             -alphabet => $seq->alphabet,
2815                                             -start    => $seq->{_start},
2816                                             -end      => $seq->{_end}
2817                                             );
2818         $new->add_seq($new_seq);
2819     }
2820
2821     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2822     return ($new, \%phylip_name);
2823 }
2824
2825 =head2 restore_displayname
2826
2827  Title     : restore_displayname
2828  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2829  Function  : Restore original sequence names (after running
2830              $ali->set_displayname_safe)
2831  Returns   : a new $aln with names restored.
2832  Argument  : a hash reference of names from "set_displayname_safe".
2833
2834 =cut
2835
2836 sub restore_displayname {
2837     my $self = shift;
2838     my $ref=shift;
2839     my %name=%$ref;
2840     my $new=Bio::SimpleAlign->new();
2841     foreach my $seq ( $self->each_seq() ) {
2842       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2843       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2844                                           -seq      => $seq->seq(),
2845                                           -alphabet => $seq->alphabet,
2846                                           -start    => $seq->{_start},
2847                                           -end      => $seq->{_end}
2848                                           );
2849       $new->add_seq($new_seq);
2850     }
2851     return $new;
2852 }
2853
2854 =head2 sort_by_start
2855
2856  Title     : sort_by_start
2857  Usage     : $ali->sort_by_start
2858  Function  : Changes the order of the alignment to the start position of each
2859              subalignment
2860  Returns   :
2861  Argument  :
2862
2863 =cut
2864
2865 sub sort_by_start {
2866     my $self = shift;
2867     my ($seq,$nse,@arr,%hash,$count);
2868     foreach $seq ( $self->each_seq() ) {
2869         $nse = $seq->get_nse;
2870         $hash{$nse} = $seq;
2871     }
2872     $count = 0;
2873     %{$self->{'_order'}} = (); # reset the hash;
2874     foreach $nse ( sort _startend keys %hash) {
2875         $self->{'_order'}->{$count} = $nse;
2876         $count++;
2877     }
2878     1;
2879 }
2880
2881 sub _startend
2882 {
2883     my ($aname,$arange) = split (/[\/]/,$a);
2884     my ($bname,$brange) = split (/[\/]/,$b);
2885     my ($astart,$aend) = split(/\-/,$arange);
2886     my ($bstart,$bend) = split(/\-/,$brange);
2887     return $astart <=> $bstart;
2888 }
2889
2890 =head2 bracket_string
2891
2892  Title     : bracket_string
2893  Usage     : my @params = (-refseq     => 'testseq',
2894                            -allele1    => 'allele1',
2895                            -allele2    => 'allele2',
2896                            -delimiters => '{}',
2897                            -separator  => '/');
2898              $str = $aln->bracket_string(@params)
2899
2900  Function :  When supplied with a list of parameters (see below), returns a
2901              string in BIC format. This is used for allelic comparisons.
2902              Briefly, if either allele contains a base change when compared to
2903              the refseq, the base or gap for each allele is represented in
2904              brackets in the order present in the 'alleles' parameter.
2905
2906              For the following data:
2907
2908              >testseq
2909              GGATCCATTGCTACT
2910              >allele1
2911              GGATCCATTCCTACT
2912              >allele2
2913              GGAT--ATTCCTCCT
2914
2915              the returned string with parameters 'refseq => testseq' and
2916              'alleles => [qw(allele1 allele2)]' would be:
2917
2918              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2919  Returns   : BIC-formatted string
2920  Argument  : Required args
2921                 refseq    : string (ID) of the reference sequence used
2922                             as basis for comparison
2923                 allele1   : string (ID) of the first allele
2924                 allele2   : string (ID) of the second allele
2925              Optional args
2926                 delimiters: two symbol string of left and right delimiters.
2927                             Only the first two symbols are used
2928                             default = '[]'
2929                 separator : string used as a separator.  Only the first
2930                             symbol is used
2931                             default = '/'
2932  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2933
2934 =cut
2935
2936 sub bracket_string {
2937     my ($self, @args) = @_;
2938     my ($ref, $a1, $a2, $delim, $sep) =
2939         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2940     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2941     my ($ld, $rd);
2942     ($ld, $rd) = split('', $delim, 2) if $delim;
2943     $ld ||= '[';
2944     $rd ||= ']';
2945     $sep ||= '/';
2946     my ($refseq, $allele1, $allele2) =
2947         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2948     if (!$refseq || !$allele1 || !$allele2) {
2949         $self->throw("One of your refseq/allele IDs is invalid!");
2950     }
2951     my $len = $self->length-1;
2952     my $bic = '';
2953     # loop over the alignment columns
2954     for my $column ( 0 .. $len ) {
2955         my $string;
2956         my ($compres, $res1, $res2) =
2957             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2958         # are any of the allele symbols different from the refseq?
2959         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2960                 $ld.$res1.$sep.$res2.$rd;
2961         $bic .= $string;
2962     }
2963     return $bic;
2964 }
2965
2966
2967 =head2 methods implementing Bio::FeatureHolderI
2968
2969 FeatureHolderI implementation to support labeled character sets like one
2970 would get from NEXUS represented data.
2971
2972 =head2 get_SeqFeatures
2973
2974  Usage   : @features = $aln->get_SeqFeatures
2975  Function: Get the feature objects held by this feature holder.
2976  Example :
2977  Returns : an array of Bio::SeqFeatureI implementing objects
2978  Args    : optional filter coderef, taking a Bio::SeqFeatureI
2979          : as argument, returning TRUE if wanted, FALSE if
2980          : unwanted
2981
2982 =cut
2983
2984 sub get_SeqFeatures {
2985     my $self = shift;
2986     my $filter_cb = shift;
2987     $self->throw("Arg (filter callback) must be a coderef") unless
2988         !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2989     if( !defined $self->{'_as_feat'} ) {
2990         $self->{'_as_feat'} = [];
2991     }
2992     if ($filter_cb) {
2993         return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2994     }
2995     return @{$self->{'_as_feat'}};
2996 }
2997
2998 =head2 add_SeqFeature
2999
3000  Usage   : $aln->add_SeqFeature($subfeat);
3001  Function: adds a SeqFeature into the SeqFeature array.
3002  Example :
3003  Returns : true on success
3004  Args    : a Bio::SeqFeatureI object
3005  Note    : This implementation is not compliant
3006            with Bio::FeatureHolderI
3007
3008 =cut
3009
3010 sub add_SeqFeature {
3011    my ($self,@feat) = @_;
3012
3013    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3014
3015    foreach my $feat ( @feat ) {
3016        if( !$feat->isa("Bio::SeqFeatureI") ) {
3017            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3018        }
3019
3020        push(@{$self->{'_as_feat'}},$feat);
3021    }
3022    return 1;
3023 }
3024
3025
3026 =head2 remove_SeqFeatures
3027
3028  Usage   : $obj->remove_SeqFeatures
3029  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3030            remove that subset from the returned array, and add back the rest.
3031  Returns : The array of Bio::SeqFeatureI features that was
3032            deleted from this alignment.
3033  Args    : none
3034
3035 =cut
3036
3037 sub remove_SeqFeatures {
3038     my $self = shift;
3039
3040     return () unless $self->{'_as_feat'};
3041     my @feats = @{$self->{'_as_feat'}};
3042     $self->{'_as_feat'} = [];
3043     return @feats;
3044 }
3045
3046 =head2 feature_count
3047
3048  Title   : feature_count
3049  Usage   : $obj->feature_count()
3050  Function: Return the number of SeqFeatures attached to the alignment
3051  Returns : integer representing the number of SeqFeatures
3052  Args    : None
3053
3054 =cut
3055
3056 sub feature_count {
3057     my ($self) = @_;
3058
3059     if (defined($self->{'_as_feat'})) {
3060         return ($#{$self->{'_as_feat'}} + 1);
3061     } else {
3062         return 0;
3063     }
3064 }
3065
3066 =head2 get_all_SeqFeatures
3067
3068  Title   : get_all_SeqFeatures
3069  Usage   :
3070  Function: Get all SeqFeatures.
3071  Example :
3072  Returns : an array of Bio::SeqFeatureI implementing objects
3073  Args    : none
3074  Note    : Falls through to Bio::FeatureHolderI implementation.
3075
3076 =cut
3077
3078 =head2 methods for Bio::AnnotatableI
3079
3080 AnnotatableI implementation to support sequence alignments which
3081 contain annotation (NEXUS, Stockholm).
3082
3083 =head2 annotation
3084
3085  Title   : annotation
3086  Usage   : $ann = $aln->annotation or
3087            $aln->annotation($ann)
3088  Function: Gets or sets the annotation
3089  Returns : Bio::AnnotationCollectionI object
3090  Args    : None or Bio::AnnotationCollectionI object
3091
3092 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3093 for more information
3094
3095 =cut
3096
3097 sub annotation {
3098     my ($obj,$value) = @_;
3099     if( defined $value ) {
3100         $obj->throw("object of class ".ref($value)." does not implement ".
3101                 "Bio::AnnotationCollectionI. Too bad.")
3102             unless $value->isa("Bio::AnnotationCollectionI");
3103         $obj->{'_annotation'} = $value;
3104     } elsif( ! defined $obj->{'_annotation'}) {
3105         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3106     }
3107     return $obj->{'_annotation'};
3108 }
3109
3110 =head1 Deprecated methods
3111
3112 =cut
3113
3114 =head2 no_residues
3115
3116  Title     : no_residues
3117  Usage     : $no = $ali->no_residues
3118  Function  : number of residues in total in the alignment
3119  Returns   : integer
3120  Argument  :
3121  Note      : deprecated in favor of num_residues()
3122
3123 =cut
3124
3125 sub no_residues {
3126         my $self = shift;
3127         $self->deprecated(-warn_version => 1.0069,
3128                                           -throw_version => 1.0075,
3129                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3130     $self->num_residues(@_);
3131 }
3132
3133 =head2 no_sequences
3134
3135  Title     : no_sequences
3136  Usage     : $depth = $ali->no_sequences
3137  Function  : number of sequence in the sequence alignment
3138  Returns   : integer
3139  Argument  :
3140  Note      : deprecated in favor of num_sequences()
3141
3142 =cut
3143
3144 sub no_sequences {
3145         my $self = shift;
3146         $self->deprecated(-warn_version => 1.0069,
3147                                           -throw_version => 1.0075,
3148                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3149     $self->num_sequences(@_);
3150 }
3151
3152 =head2 mask_columns
3153
3154  Title     : mask_columns
3155  Usage     : $aln2 = $aln->mask_columns(20,30)
3156  Function  : Masks a slice of the alignment inclusive of start and
3157              end columns, and the first column in the alignment is denoted 1.
3158              Mask beyond the length of the sequence does not do padding.
3159  Returns   : A Bio::SimpleAlign object
3160  Args      : Positive integer for start column, positive integer for end column,
3161              optional string value use for the mask. Example:
3162
3163              $aln2 = $aln->mask_columns(20,30,'?')
3164  Note      : Masking must use a character that is not used for gaps or
3165              frameshifts.  These can be adjusted using the relevant global
3166              variables, but be aware these may be (uncontrollably) modified
3167              elsewhere within BioPerl (see bug 2715)
3168
3169 =cut
3170
3171 sub mask_columns {
3172     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3173     my $self = shift;
3174
3175     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3176              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3177
3178     # coordinates are alignment-based, not sequence-based
3179     my ($start, $end, $mask_char) = @_;
3180     unless (defined $mask_char) { $mask_char = 'N' }
3181
3182     $self->throw("Mask start has to be a positive integer and less than ".
3183                  "alignment length, not [$start]")
3184       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3185     $self->throw("Mask end has to be a positive integer and less than ".
3186                  "alignment length, not [$end]")
3187       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3188     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3189                  "end [$end]") unless $start <= $end;
3190     $self->throw("Mask character $mask_char has to be a single character ".
3191                  "and not a gap or frameshift symbol")
3192       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3193
3194     my $aln = $self->new;
3195     $aln->id($self->id);
3196     foreach my $seq ( $self->each_seq() ) {
3197         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3198          -alphabet => $seq->alphabet,
3199          -strand  => $seq->strand,
3200          -verbose => $self->verbose);
3201
3202         # convert from 1-based alignment coords!
3203         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3204         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3205         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3206         $new_seq->seq($new_dna_string);
3207         $aln->add_seq($new_seq);
3208     }
3209     return $aln;
3210 }
3211
3212
3213
3214 1;