Bio/SimpleAlign.pm

   1 # BioPerl module for SimpleAlign
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12 #
  13 #  History:
  14 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  15 #       May 2001 major rewrite - Heikki Lehvaslaiho
  16
  17 =head1 NAME
  18
  19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  20
  21 =head1 SYNOPSIS
  22
  23   # Use Bio::AlignIO to read in the alignment
  24   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  25   $aln = $str->next_aln();
  26
  27   # Describe
  28   print $aln->length;
  29   print $aln->num_residues;
  30   print $aln->is_flush;
  31   print $aln->num_sequences;
  32   print $aln->score;
  33   print $aln->percentage_identity;
  34   print $aln->consensus_string(50);
  35
  36   # Find the position in the alignment for a sequence location
  37   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  38
  39   # Extract sequences and check values for the alignment column $pos
  40   foreach $seq ($aln->each_seq) {
  41       $res = $seq->subseq($pos, $pos);
  42       $count{$res}++;
  43   }
  44   foreach $res (keys %count) {
  45       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  46   }
  47
  48   # Manipulate
  49   $aln->remove_seq($seq);
  50   $mini_aln = $aln->slice(20,30);  # get a block of columns
  51   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  52   $new_aln = $aln->remove_columns([20,30]); # remove by position
  53   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  54
  55   # Analyze
  56   $str = $aln->consensus_string($threshold_percent);
  57   $str = $aln->match_line();
  58   $str = $aln->cigar_line();
  59   $id = $aln->percentage_identity;
  60
  61   # See the module documentation for details and more methods.
  62
  63 =head1 DESCRIPTION
  64
  65 SimpleAlign is an object that handles a multiple sequence alignment
  66 (MSA). It is very permissive of types (it does not insist on sequences
  67 being all same length, for example). Think of it as a set of sequences
  68 with a whole series of built-in manipulations and methods for reading and
  69 writing alignments.
  70
  71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  72 to store its sequences. These are subsequences with a start and end
  73 positions in the parent reference sequence. Each sequence in the
  74 SimpleAlign object is a Bio::LocatableSeq.
  75
  76 SimpleAlign expects the combination of name, start, and end for a
  77 given sequence to be unique in the alignment, and this is the key for the
  78 internal hashes (name, start, end are abbreviated C<nse> in the code).
  79 However, in some cases people do not want the name/start-end to be displayed:
  80 either multiple names in an alignment or names specific to the alignment
  81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  82 C<displayname>, and generally is what is used to print out the
  83 alignment. They default to name/start-end.
  84
  85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  86
  87 =head1 FEEDBACK
  88
  89 =head2 Mailing Lists
  90
  91 User feedback is an integral part of the evolution of this and other
  92 Bioperl modules.  Send your comments and suggestions preferably to one
  93 of the Bioperl mailing lists.  Your participation is much appreciated.
  94
  95   bioperl-l@bioperl.org                  - General discussion
  96   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  97
  98 =head2 Support
  99
 100 Please direct usage questions or support issues to the mailing list:
 101
 102 I<bioperl-l@bioperl.org>
 103
 104 rather than to the module maintainer directly. Many experienced and
 105 reponsive experts will be able look at the problem and quickly
 106 address it. Please include a thorough description of the problem
 107 with code and data examples if at all possible.
 108
 109 =head2 Reporting Bugs
 110
 111 Report bugs to the Bioperl bug tracking system to help us keep track
 112 the bugs and their resolution. Bug reports can be submitted via the
 113 web:
 114
 115   https://redmine.open-bio.org/projects/bioperl/
 116
 117 =head1 AUTHOR
 118
 119 Ewan Birney, birney@ebi.ac.uk
 120
 121 =head1 CONTRIBUTORS
 122
 123 Allen Day, allenday-at-ucla.edu,
 124 Richard Adams, Richard.Adams-at-ed.ac.uk,
 125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 127 Allen Smith, allens-at-cpan.org,
 128 Jason Stajich, jason-at-bioperl.org,
 129 Anthony Underwood, aunderwood-at-phls.org.uk,
 130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 131 Brian Osborne, bosborne at alum.mit.edu
 132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 133 Hongyu Zhang, forward at hongyu.org
 134 Jay Hannah, jay at jays.net
 135 Alexandr Bezginov, albezg at gmail.com
 136
 137 =head1 SEE ALSO
 138
 139 L<Bio::LocatableSeq>
 140
 141 =head1 APPENDIX
 142
 143 The rest of the documentation details each of the object
 144 methods. Internal methods are usually preceded with a _
 145
 146 =cut
 147
 148 # 'Let the code begin...
 149
 150 package Bio::SimpleAlign;
 151 use vars qw(%CONSERVATION_GROUPS);
 152 use strict;
 153
 154 use Bio::LocatableSeq;  # uses Seq's as list
 155
 156 use Bio::Seq;
 157 use Bio::SeqFeature::Generic;
 158
 159 BEGIN {
 160     # This data should probably be in a more centralized module...
 161     # it is taken from Clustalw documentation.
 162     # These are all the positively scoring groups that occur in the
 163     # Gonnet Pam250 matrix. The strong and weak groups are
 164     # defined as strong score >0.5 and weak score =<0.5 respectively.
 165
 166     %CONSERVATION_GROUPS = (
 167             'strong' => [ qw(
 168                                                  STA
 169                                                  NEQK
 170                                                  NHQK
 171                                                  NDEQ
 172                                                  QHRK
 173                                                  MILV
 174                                                  MILF
 175                                                  HY
 176                                                  FYW )],
 177                                 'weak' => [ qw(
 178                       CSA
 179                                                ATV
 180                                                SAG
 181                                                STNK
 182                                                STPA
 183                                                SGND
 184                                                SNDEQK
 185                                                NDEQHK
 186                                                NEQHRK
 187                                                FVLIM
 188                                                HFY )],);
 189 }
 190
 191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 192             Bio::FeatureHolderI);
 193
 194 =head2 new
 195
 196  Title     : new
 197  Usage     : my $aln = Bio::SimpleAlign->new();
 198  Function  : Creates a new simple align object
 199  Returns   : Bio::SimpleAlign
 200  Args      : -source     => string representing the source program
 201                             where this alignment came from
 202              -annotation => Bio::AnnotationCollectionI
 203              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 204              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 205              -consensus  => consensus string
 206              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 207
 208 =cut
 209
 210
 211 sub new {
 212   my($class,@args) = @_;
 213
 214   my $self = $class->SUPER::new(@args);
 215
 216   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 217                                             SOURCE
 218                                             SCORE
 219                                             ID
 220                                             ACCESSION
 221                                             DESCRIPTION
 222                                             SEQS
 223                                             FEATURES
 224                                             ANNOTATION
 225                                             SEQ_ANNOTATION
 226                                             CONSENSUS
 227                                             CONSENSUS_META
 228                                             )], @args);
 229   $src && $self->source($src);
 230   defined $score && $self->score($score);
 231   # we need to set up internal hashs first!
 232
 233   $self->{'_seq'} = {};
 234   $self->{'_order'} = {};
 235   $self->{'_start_end_lists'} = {};
 236   $self->{'_dis_name'} = {};
 237   $self->{'_id'} = 'NoName';
 238   # maybe we should automatically read in from args. Hmmm...
 239   $id  && $self->id($id);
 240   $acc && $self->accession($acc);
 241   $desc && $self->description($desc);
 242   $coll && $self->annotation($coll);
 243   # sequence annotation is layered into a provided annotation collection (or dies)
 244   if ($sa) {
 245     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 246                  "with a sequence annotation collection")
 247         if !$coll;
 248     $coll->add_Annotation('seq_annotation', $sa);
 249   }
 250   if ($feats && ref $feats eq 'ARRAY') {
 251     for my $feat (@$feats) {
 252         $self->add_SeqFeature($feat);
 253     }
 254   }
 255   $con && $self->consensus($con);
 256   $cmeta && $self->consensus_meta($cmeta);
 257   # assumes these are in correct alignment order
 258   if ($seqs && ref($seqs) eq 'ARRAY') {
 259     for my $seq (@$seqs) {
 260         $self->add_seq($seq);
 261     }
 262   }
 263
 264   return $self; # success - we hope!
 265 }
 266
 267 =head1 Modifier methods
 268
 269 These methods modify the MSA by adding, removing or shuffling complete
 270 sequences.
 271
 272 =head2 add_seq
 273
 274  Title     : add_seq
 275  Usage     : $myalign->add_seq($newseq);
 276              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 277  Function  : Adds another sequence to the alignment. *Does not* align
 278              it - just adds it to the hashes.
 279              If -ORDER is specified, the sequence is inserted at the
 280              the position spec'd by -ORDER, and existing sequences
 281              are pushed down the storage array.
 282  Returns   : nothing
 283  Args      : A Bio::LocatableSeq object
 284              Positive integer for the sequence position (optional)
 285
 286 See L<Bio::LocatableSeq> for more information
 287
 288 =cut
 289
 290 sub addSeq {
 291     my $self = shift;
 292     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 293     $self->add_seq(@_);
 294 }
 295
 296 sub add_seq {
 297     my $self = shift;
 298     my @args = @_;
 299     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 300     my ($name,$id,$start,$end);
 301
 302     unless ($seq) {
 303         $self->throw("LocatableSeq argument required");
 304     }
 305     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 306         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 307     }
 308     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 309     $order--; # jay's patch (user-specified order is 1-origin)
 310
 311     if ($order < 0) {
 312         $self->throw("User-specified value for ORDER must be >= 1");
 313     }
 314
 315     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 316
 317     # build the symbol list for this sequence,
 318     # will prune out the gap and missing/match chars
 319     # when actually asked for the symbol list in the
 320     # symbol_chars
 321     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 322
 323     $name = $seq->get_nse;
 324
 325     if( $self->{'_seq'}->{$name} ) {
 326         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 327     }
 328     else {
 329         $self->debug( "Assigning $name to $order\n");
 330
 331     my $ordh = $self->{'_order'};
 332     if ($ordh->{$order}) {
 333         # make space to insert
 334         # $c->() returns (in reverse order) the first subsequence
 335         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 336         # (3,2,1), and $c->(2,4,5) returns (2).
 337         my $c;
 338         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 339         map {
 340      $ordh->{$_+1} = $ordh->{$_}
 341         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 342
 343     }
 344     $ordh->{$order} = $name;
 345
 346         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 347             $self->{'_start_end_lists'}->{$id} = [];
 348         }
 349         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 350     }
 351
 352     $self->{'_seq'}->{$name} = $seq;
 353
 354 }
 355
 356
 357 =head2 remove_seq
 358
 359  Title     : remove_seq
 360  Usage     : $aln->remove_seq($seq);
 361  Function  : Removes a single sequence from an alignment
 362  Returns   :
 363  Argument  : a Bio::LocatableSeq object
 364
 365 =cut
 366
 367 sub removeSeq {
 368     my $self = shift;
 369     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 370     $self->remove_seq(@_);
 371 }
 372
 373 sub remove_seq {
 374     my $self = shift;
 375     my $seq = shift;
 376     my ($name,$id);
 377
 378     $self->throw("Need Bio::Locatable seq argument ")
 379         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 380
 381     $id = $seq->id();
 382     $name = $seq->get_nse;
 383
 384     if( !exists $self->{'_seq'}->{$name} ) {
 385         $self->throw("Sequence $name does not exist in the alignment to remove!");
 386     }
 387
 388     delete $self->{'_seq'}->{$name};
 389
 390     # we need to remove this seq from the start_end_lists hash
 391
 392     if (exists $self->{'_start_end_lists'}->{$id}) {
 393         # we need to find the sequence in the array.
 394
 395         my ($i, $found);;
 396         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 397             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 398                 $found = 1;
 399                 last;
 400             }
 401         }
 402         if ($found) {
 403             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 404         }
 405         else {
 406             $self->throw("Could not find the sequence to remoce from the start-end list");
 407         }
 408     }
 409     else {
 410         $self->throw("There is no seq list for the name $id");
 411     }
 412     # we need to shift order hash
 413     my %rev_order = reverse %{$self->{'_order'}};
 414     my $no = $rev_order{$name};
 415     my $num_sequences = $self->num_sequences;
 416     for (; $no < $num_sequences; $no++) {
 417        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 418     }
 419     delete $self->{'_order'}->{$no};
 420     return 1;
 421 }
 422
 423
 424 =head2 purge
 425
 426  Title   : purge
 427  Usage   : $aln->purge(0.7);
 428  Function: Removes sequences above given sequence similarity
 429            This function will grind on large alignments. Beware!
 430  Example :
 431  Returns : An array of the removed sequences
 432  Args    : float, threshold for similarity
 433
 434 =cut
 435
 436 sub purge {
 437         my ($self,$perc) = @_;
 438         my (%duplicate, @dups);
 439
 440         my @seqs = $self->each_seq();
 441
 442         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 443                 my $seq = $seqs[$i];
 444
 445                 #skip if already in duplicate hash
 446                 next if exists $duplicate{$seq->display_id} ;
 447                 my $one = $seq->seq();
 448
 449                 my @one = split '', $one;       #split to get 1aa per array element
 450
 451                 for (my $j=$i+1;$j < @seqs;$j++) {
 452                         my $seq2 = $seqs[$j];
 453
 454                         #skip if already in duplicate hash
 455                         next if exists $duplicate{$seq2->display_id} ;
 456
 457                         my $two = $seq2->seq();
 458                         my @two = split '', $two;
 459
 460                         my $count = 0;
 461                         my $res = 0;
 462                         for (my $k=0;$k<@one;$k++) {
 463                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 464                                           $one[$k] eq $two[$k]) {
 465                                         $count++;
 466                                 }
 467                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 468                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 469                                         $res++;
 470                                 }
 471                         }
 472
 473                         my $ratio = 0;
 474                         $ratio = $count/$res unless $res == 0;
 475
 476                         # if above threshold put in duplicate hash and push onto
 477                         # duplicate array for returning to get_unique
 478                         if ( $ratio > $perc ) {
 479                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 480                                 $duplicate{$seq2->display_id} = 1;
 481                                 push @dups, $seq2;
 482                         }
 483                 }
 484         }
 485         foreach my $seq (@dups) {
 486                 $self->remove_seq($seq);
 487         }
 488         return @dups;
 489 }
 490
 491 =head2 sort_alphabetically
 492
 493  Title     : sort_alphabetically
 494  Usage     : $ali->sort_alphabetically
 495  Function  : Changes the order of the alignment to alphabetical on name
 496              followed by numerical by number.
 497  Returns   :
 498  Argument  :
 499
 500 =cut
 501
 502 sub sort_alphabetically {
 503     my $self = shift;
 504     my ($seq,$nse,@arr,%hash,$count);
 505
 506     foreach $seq ( $self->each_seq() ) {
 507         $nse = $seq->get_nse;
 508         $hash{$nse} = $seq;
 509     }
 510
 511     $count = 0;
 512
 513     %{$self->{'_order'}} = (); # reset the hash;
 514
 515     foreach $nse ( sort _alpha_startend keys %hash) {
 516         $self->{'_order'}->{$count} = $nse;
 517
 518         $count++;
 519     }
 520     1;
 521 }
 522
 523 =head2 sort_by_list
 524
 525  Title     : sort_by_list
 526  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 527  Function  : Arbitrarily order sequences in an alignment
 528  Returns   : A new Bio::SimpleAlign object
 529  Argument  : a file listing sequence names in intended order (one name per line)
 530
 531 =cut
 532
 533 sub sort_by_list {
 534     my ($self, $list) = @_;
 535     my (@seq, @ids, %order);
 536
 537     foreach my $seq ( $self->each_seq() ) {
 538         push @seq, $seq;
 539         push @ids, $seq->display_id;
 540     }
 541
 542     my $ct=1;
 543     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 544     while (<$listfh>) {
 545       chomp;
 546       my $name=$_;
 547       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 548       $order{$name}=$ct++;
 549     }
 550     close($listfh);
 551
 552     # use the map-sort-map idiom:
 553     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 554     my $aln = $self->new;
 555     foreach (@sorted) { $aln->add_seq($_) }
 556     return $aln;
 557 }
 558
 559 =head2 set_new_reference
 560
 561  Title     : set_new_reference
 562  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 563              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 564              as the reference (1st) sequence
 565  Function  : Change/Set a new reference (i.e., the first) sequence
 566  Returns   : a new Bio::SimpleAlign object.
 567              Throws an exception if designated sequence not found
 568  Argument  : a positive integer of sequence order, or a sequence name
 569              in the original alignment
 570
 571 =cut
 572
 573 sub set_new_reference {
 574     my ($self, $seqid) = @_;
 575     my $aln = $self->new;
 576     my (@seq, @ids, @new_seq);
 577     my $is_num=0;
 578     foreach my $seq ( $self->each_seq() ) {
 579         push @seq, $seq;
 580         push @ids, $seq->display_id;
 581     }
 582
 583     if ($seqid =~ /^\d+$/) { # argument is seq position
 584         $is_num=1;
 585         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 586     } else { # argument is a seq name
 587         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 588     }
 589
 590     for (my $i=0; $i<=$#seq; $i++) {
 591         my $pos=$i+1;
 592         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 593             unshift @new_seq, $seq[$i];
 594         } else {
 595             push @new_seq, $seq[$i];
 596         }
 597     }
 598     foreach (@new_seq) { $aln->add_seq($_);  }
 599     return $aln;
 600 }
 601
 602 sub _in_aln {  # check if input name exists in the alignment
 603     my ($str, $ref) = @_;
 604     foreach (@$ref) {
 605         return 1 if $str eq $_;
 606     }
 607     return 0;
 608 }
 609
 610
 611 =head2 uniq_seq
 612
 613  Title     : uniq_seq
 614  Usage     : $aln->uniq_seq():  Remove identical sequences in
 615              in the alignment.  Ambiguous base ("N", "n") and
 616              leading and ending gaps ("-") are NOT counted as
 617              differences.
 618  Function  : Make a new alignment of unique sequence types (STs)
 619  Returns   : 1a. if called in a scalar context,
 620                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 621              1b. if called in an array context,
 622                 a new Bio::SimpleAlign object, and a hashref whose keys
 623                 are sequence types, and whose values are arrayrefs to
 624                 lists of sequence ids within the corresponding sequence type
 625              2. if $aln->verbose > 0, ST of each sequence is sent to
 626                 STDERR (in a tabular format)
 627  Argument  : None
 628
 629 =cut
 630
 631 sub uniq_seq {
 632     my ($self, $seqid) = @_;
 633     my $aln = $self->new;
 634     my (%member, %order, @seq, @uniq_str, $st);
 635     my $order=0;
 636     my $len = $self->length();
 637     $st = {};
 638     foreach my $seq ( $self->each_seq() ) {
 639         my $str = $seq->seq();
 640
 641 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 642 # comparing two sequence strings
 643
 644     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 645         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 646     # 2nd, convert leading and ending gaps to "?":
 647         $str = &_convert_leading_ending_gaps($str, '-', '?');
 648     # Note that '?' also can mean unknown residue.
 649     # I don't like making global class member changes like this, too
 650     # prone to errors... -- cjfields 08-11-18
 651     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 652         my $new = Bio::LocatableSeq->new(
 653                      -id      => $seq->id(),
 654                                          -alphabet=> $seq->alphabet,
 655                                          -seq     => $str,
 656                                          -start   => $seq->start,
 657                                          -end     => $seq->end
 658                                          );
 659         push @seq, $new;
 660     }
 661
 662     foreach my $seq (@seq) {
 663         my $str = $seq->seq();
 664         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 665         if ($seen) { # seen before
 666             my @memb = @{$member{$key}};
 667             push @memb, $seq;
 668             $member{$key} = \@memb;
 669         } else {  # not seen
 670             push @uniq_str, $key;
 671             $order++;
 672             $member{$key} = [ ($seq) ];
 673             $order{$key} = $order;
 674         }
 675     }
 676
 677     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 678 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 679         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 680 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 681         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 682         my $gap='-';
 683         my $end= CORE::length($str2);
 684         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 685         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 686                                          -seq  =>$str2,
 687                                          -start=>1,
 688                                          -end  =>$end
 689                                          );
 690         $aln->add_seq($new);
 691         foreach (@{$member{$str}}) {
 692             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 693         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 694         }
 695     }
 696     return wantarray ? ($aln, $st) : $aln;
 697 }
 698
 699 sub _check_uniq {  # check if same seq exists in the alignment
 700     my ($str1, $ref, $length) = @_;
 701     my @char1=split //, $str1;
 702     my @array=@$ref;
 703
 704     return (0, $str1) if @array==0; # not seen (1st sequence)
 705
 706     foreach my $str2 (@array) {
 707         my $diff=0;
 708         my @char2=split //, $str2;
 709         for (my $i=0; $i<=$length-1; $i++) {
 710             next if $char1[$i] eq '?';
 711             next if $char2[$i] eq '?';
 712             $diff++ if $char1[$i] ne $char2[$i];
 713         }
 714         return (1, $str2) if $diff == 0;  # seen before
 715     }
 716
 717     return (0, $str1); # not seen
 718 }
 719
 720 sub _convert_leading_ending_gaps {
 721     my $s=shift;
 722     my $sym1=shift;
 723     my $sym2=shift;
 724     my @array=split //, $s;
 725 # convert leading char:
 726     for (my $i=0; $i<=$#array; $i++) {
 727         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 728     }
 729 # convert ending char:
 730     for (my $i = $#array; $i>= 0; $i--) {
 731         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 732     }
 733     my $s_new=join '', @array;
 734     return $s_new;
 735 }
 736
 737 =head1 Sequence selection methods
 738
 739 Methods returning one or more sequences objects.
 740
 741 =head2 each_seq
 742
 743  Title     : each_seq
 744  Usage     : foreach $seq ( $align->each_seq() )
 745  Function  : Gets a Seq object from the alignment
 746  Returns   : Seq object
 747  Argument  :
 748
 749 =cut
 750
 751 sub eachSeq {
 752     my $self = shift;
 753     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 754     $self->each_seq();
 755 }
 756
 757 sub each_seq {
 758         my $self = shift;
 759         my (@arr,$order);
 760
 761         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 762                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 763                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 764                 }
 765         }
 766         return @arr;
 767 }
 768
 769
 770 =head2 each_alphabetically
 771
 772  Title     : each_alphabetically
 773  Usage     : foreach $seq ( $ali->each_alphabetically() )
 774  Function  : Returns a sequence object, but the objects are returned
 775              in alphabetically sorted order.
 776              Does not change the order of the alignment.
 777  Returns   : Seq object
 778  Argument  :
 779
 780 =cut
 781
 782 sub each_alphabetically {
 783         my $self = shift;
 784         my ($seq,$nse,@arr,%hash,$count);
 785
 786         foreach $seq ( $self->each_seq() ) {
 787                 $nse = $seq->get_nse;
 788                 $hash{$nse} = $seq;
 789         }
 790
 791         foreach $nse ( sort _alpha_startend keys %hash) {
 792                 push(@arr,$hash{$nse});
 793         }
 794         return @arr;
 795 }
 796
 797 sub _alpha_startend {
 798     my ($aname,$astart,$bname,$bstart);
 799     ($aname,$astart) = split (/-/,$a);
 800     ($bname,$bstart) = split (/-/,$b);
 801
 802     if( $aname eq $bname ) {
 803         return $astart <=> $bstart;
 804     }
 805     else {
 806         return $aname cmp $bname;
 807     }
 808 }
 809
 810 =head2 each_seq_with_id
 811
 812  Title     : each_seq_with_id
 813  Usage     : foreach $seq ( $align->each_seq_with_id() )
 814  Function  : Gets a Seq objects from the alignment, the contents
 815              being those sequences with the given name (there may be
 816              more than one)
 817  Returns   : Seq object
 818  Argument  : a seq name
 819
 820 =cut
 821
 822 sub eachSeqWithId {
 823     my $self = shift;
 824     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 825     $self->each_seq_with_id(@_);
 826 }
 827
 828 sub each_seq_with_id {
 829     my $self = shift;
 830     my $id = shift;
 831
 832     $self->throw("Method each_seq_with_id needs a sequence name argument")
 833         unless defined $id;
 834
 835     my (@arr, $seq);
 836
 837     if (exists($self->{'_start_end_lists'}->{$id})) {
 838         @arr = @{$self->{'_start_end_lists'}->{$id}};
 839     }
 840     return @arr;
 841 }
 842
 843 =head2 get_seq_by_pos
 844
 845  Title     : get_seq_by_pos
 846  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 847  Function  : Gets a sequence based on its position in the alignment.
 848              Numbering starts from 1.  Sequence positions larger than
 849              num_sequences() will thow an error.
 850  Returns   : a Bio::LocatableSeq object
 851  Args      : positive integer for the sequence position
 852
 853 =cut
 854
 855 sub get_seq_by_pos {
 856
 857     my $self = shift;
 858     my ($pos) = @_;
 859
 860     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 861         unless $pos =~ /^\d+$/ and $pos > 0;
 862     $self->throw("No sequence at position [$pos]")
 863         unless $pos <= $self->num_sequences ;
 864
 865     my $nse = $self->{'_order'}->{--$pos};
 866     return $self->{'_seq'}->{$nse};
 867 }
 868
 869 =head2 get_seq_by_id
 870
 871  Title     : get_seq_by_id
 872  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 873  Function  : Gets a sequence based on its name.
 874              Sequences that do not exist will warn and return undef
 875  Returns   : a Bio::LocatableSeq object
 876  Args      : string for sequence name
 877
 878 =cut
 879
 880 sub get_seq_by_id {
 881     my ($self,$name) = @_;
 882     unless( defined $name ) {
 883       $self->warn("Must provide a sequence name");
 884       return;
 885     }
 886     for my $seq ( values %{$self->{'_seq'}} ) {
 887       if ( $seq->id eq $name) {
 888         return $seq;
 889       }
 890     }
 891     return;
 892 }
 893
 894 =head2 seq_with_features
 895
 896  Title   : seq_with_features
 897  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 898                                           -consensus => 60
 899                                           -mask =>
 900            sub { my $consensus = shift;
 901
 902                  for my $i (1..5){
 903                     my $n = 'N' x $i;
 904                     my $q = '\?' x $i;
 905                     while($consensus =~ /[^?]$q[^?]/){
 906                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 907                     }
 908                   }
 909                  return $consensus;
 910                }
 911                                          );
 912  Function: produces a Bio::Seq object by first splicing gaps from -pos
 913            (by means of a splice_by_seq_pos() call), then creating
 914            features using non-? chars (by means of a consensus_string()
 915            call with stringency -consensus).
 916  Returns : a Bio::Seq object
 917  Args    : -pos : required. sequence from which to build the Bio::Seq
 918              object
 919            -consensus : optional, defaults to consensus_string()'s
 920              default cutoff value
 921            -mask : optional, a coderef to apply to consensus_string()'s
 922              output before building features.  this may be useful for
 923              closing gaps of 1 bp by masking over them with N, for
 924              instance
 925
 926 =cut
 927
 928 sub seq_with_features{
 929    my ($self,%arg) = @_;
 930
 931    #first do the preparatory splice
 932    $self->throw("must provide a -pos argument") unless $arg{-pos};
 933    $self->splice_by_seq_pos($arg{-pos});
 934
 935    my $consensus_string = $self->consensus_string($arg{-consensus});
 936    $consensus_string = $arg{-mask}->($consensus_string)
 937          if defined($arg{-mask});
 938
 939    my(@bs,@es);
 940
 941    push @bs, 1 if $consensus_string =~ /^[^?]/;
 942
 943    while($consensus_string =~ /\?[^?]/g){
 944          push @bs, pos($consensus_string);
 945    }
 946    while($consensus_string =~ /[^?]\?/g){
 947          push @es, pos($consensus_string);
 948    }
 949
 950    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 951
 952    my $seq = Bio::Seq->new();
 953
 954 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 955 #                -source_tag => 'location',
 956 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 957 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 958 #                                                  );
 959 #   $seq->add_SeqFeature($rootfeature);
 960
 961    while(my $b = shift @bs){
 962          my $e = shift @es;
 963          $seq->add_SeqFeature(
 964        Bio::SeqFeature::Generic->new(
 965          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 966          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 967          -source_tag => $self->source || 'MSA',
 968        )
 969      );
 970    }
 971
 972    return $seq;
 973 }
 974
 975
 976 =head1 Create new alignments
 977
 978 The result of these methods are horizontal or vertical subsets of the
 979 current MSA.
 980
 981 =head2 select
 982
 983  Title     : select
 984  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 985  Function  : Creates a new alignment from a continuous subset of
 986              sequences.  Numbering starts from 1.  Sequence positions
 987              larger than num_sequences() will thow an error.
 988  Returns   : a Bio::SimpleAlign object
 989  Args      : positive integer for the first sequence
 990              positive integer for the last sequence to include (optional)
 991
 992 =cut
 993
 994 sub select {
 995     my $self = shift;
 996     my ($start, $end) = @_;
 997
 998     $self->throw("Select start has to be a positive integer, not [$start]")
 999         unless $start =~ /^\d+$/ and $start > 0;
1000     $self->throw("Select end has to be a positive integer, not [$end]")
1001         unless $end  =~ /^\d+$/ and $end > 0;
1002     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1003         unless $start <= $end;
1004
1005     my $aln = $self->new;
1006     foreach my $pos ($start .. $end) {
1007         $aln->add_seq($self->get_seq_by_pos($pos));
1008     }
1009     $aln->id($self->id);
1010     # fix for meta, sf, ann
1011     return $aln;
1012 }
1013
1014 =head2 select_noncont
1015
1016  Title     : select_noncont
1017  Usage     : # 1st and 3rd sequences, sorted
1018              $aln2 = $aln->select_noncont(1, 3)
1019
1020              # 1st and 3rd sequences, sorted (same as first)
1021              $aln2 = $aln->select_noncont(3, 1)
1022
1023              # 1st and 3rd sequences, unsorted
1024              $aln2 = $aln->select_noncont('nosort',3, 1)
1025
1026  Function  : Creates a new alignment from a subset of sequences.  Numbering
1027              starts from 1.  Sequence positions larger than num_sequences() will
1028              throw an error.  Sorts the order added to new alignment by default,
1029              to prevent sorting pass 'nosort' as the first argument in the list.
1030  Returns   : a Bio::SimpleAlign object
1031  Args      : array of integers for the sequences.  If the string 'nosort' is
1032              passed as the first argument, the sequences will not be sorted
1033              in the new alignment but will appear in the order listed.
1034
1035 =cut
1036
1037 sub select_noncont {
1038         my $self = shift;
1039     my $nosort = 0;
1040         my (@pos) = @_;
1041     if ($pos[0] !~ m{^\d+$}) {
1042         my $sortcmd = shift @pos;
1043         if ($sortcmd eq 'nosort') {
1044             $nosort = 1;
1045         } else {
1046             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1047         }
1048     }
1049
1050     my $end = $self->num_sequences;
1051     foreach ( @pos ) {
1052                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1053                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1054         }
1055
1056         @pos = sort {$a <=> $b} @pos unless $nosort;
1057
1058         my $aln = $self->new;
1059         foreach my $p (@pos) {
1060                 $aln->add_seq($self->get_seq_by_pos($p));
1061         }
1062         $aln->id($self->id);
1063     # fix for meta, sf, ann
1064         return $aln;
1065 }
1066
1067 =head2 slice
1068
1069  Title     : slice
1070  Usage     : $aln2 = $aln->slice(20,30)
1071  Function  : Creates a slice from the alignment inclusive of start and
1072              end columns, and the first column in the alignment is denoted 1.
1073              Sequences with no residues in the slice are excluded from the
1074              new alignment and a warning is printed. Slice beyond the length of
1075              the sequence does not do padding.
1076  Returns   : A Bio::SimpleAlign object
1077  Args      : Positive integer for start column, positive integer for end column,
1078              optional boolean which if true will keep gap-only columns in the newly
1079              created slice. Example:
1080
1081              $aln2 = $aln->slice(20,30,1)
1082
1083 =cut
1084
1085 sub slice {
1086         my $self = shift;
1087         my ($start, $end, $keep_gap_only) = @_;
1088
1089         $self->throw("Slice start has to be a positive integer, not [$start]")
1090           unless $start =~ /^\d+$/ and $start > 0;
1091         $self->throw("Slice end has to be a positive integer, not [$end]")
1092           unless $end =~ /^\d+$/ and $end > 0;
1093         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1094           unless $start <= $end;
1095         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1096                                          "[$start] is too big.") if $start > $self->length;
1097     my $cons_meta = $self->consensus_meta;
1098         my $aln = $self->new;
1099         $aln->id($self->id);
1100         foreach my $seq ( $self->each_seq() ) {
1101             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1102             Bio::Seq::Meta->new
1103         (-id      => $seq->id,
1104                  -alphabet => $seq->alphabet,
1105                  -strand  => $seq->strand,
1106                  -verbose => $self->verbose) :
1107             Bio::LocatableSeq->new
1108         (-id      => $seq->id,
1109                  -alphabet => $seq->alphabet,
1110                  -strand  => $seq->strand,
1111                  -verbose => $self->verbose);
1112
1113             # seq
1114             my $seq_end = $end;
1115             $seq_end = $seq->length if( $end > $seq->length );
1116
1117             my $slice_seq = $seq->subseq($start, $seq_end);
1118             $new_seq->seq( $slice_seq );
1119
1120             $slice_seq =~ s/\W//g;
1121
1122             if ($start > 1) {
1123             my $pre_start_seq = $seq->subseq(1, $start - 1);
1124             $pre_start_seq =~ s/\W//g;
1125             if (!defined($seq->strand)) {
1126                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1127             } elsif ($seq->strand < 0){
1128                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1129             } else {
1130                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1131             }
1132             } else {
1133             if ((defined $seq->strand)&&($seq->strand < 0)){
1134                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1135             } else {
1136                $new_seq->start( $seq->start);
1137             }
1138             }
1139         if ($new_seq->isa('Bio::Seq::MetaI')) {
1140             for my $meta_name ($seq->meta_names) {
1141                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1142             }
1143         }
1144             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1145
1146             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1147             $aln->add_seq($new_seq);
1148             } else {
1149             if( $keep_gap_only ) {
1150                 $aln->add_seq($new_seq);
1151             } else {
1152                 my $nse = $seq->get_nse();
1153                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1154                     " Sequence excluded from the new alignment.");
1155             }
1156             }
1157         }
1158     if ($cons_meta) {
1159         my $new = Bio::Seq::Meta->new();
1160         for my $meta_name ($cons_meta->meta_names) {
1161             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1162         }
1163         $aln->consensus_meta($new);
1164     }
1165     $aln->annotation($self->annotation);
1166     # fix for meta, sf, ann
1167         return $aln;
1168 }
1169
1170 =head2 remove_columns
1171
1172  Title     : remove_columns
1173  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1174              $aln2 = $aln->remove_columns([0,0],[6,8])
1175  Function  : Creates an aligment with columns removed corresponding to
1176              the specified type or by specifying the columns by number.
1177  Returns   : Bio::SimpleAlign object
1178  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1179              'all_gaps_columns') or array ref where the referenced array
1180              contains a pair of integers that specify a range.
1181              The first column is 0
1182
1183 =cut
1184
1185 sub remove_columns {
1186     my ($self,@args) = @_;
1187     @args || $self->throw("Must supply column ranges or column types");
1188     my $aln;
1189
1190     if ($args[0][0] =~ /^[a-z_]+$/i) {
1191         $aln = $self->_remove_columns_by_type($args[0]);
1192     } elsif ($args[0][0] =~ /^\d+$/) {
1193         $aln = $self->_remove_columns_by_num(\@args);
1194     } else {
1195         $self->throw("You must pass array references to remove_columns(), not @args");
1196     }
1197     # fix for meta, sf, ann
1198     $aln;
1199 }
1200
1201
1202 =head2 remove_gaps
1203
1204  Title     : remove_gaps
1205  Usage     : $aln2 = $aln->remove_gaps
1206  Function  : Creates an aligment with gaps removed
1207  Returns   : a Bio::SimpleAlign object
1208  Args      : a gap character(optional) if none specified taken
1209                 from $self->gap_char,
1210              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1211                         indicates that only all-gaps columns should be deleted
1212
1213 Used from method L<remove_columns> in most cases. Set gap character
1214 using L<gap_char()|gap_char>.
1215
1216 =cut
1217
1218 sub remove_gaps {
1219     my ($self,$gapchar,$all_gaps_columns) = @_;
1220     my $gap_line;
1221     if ($all_gaps_columns) {
1222         $gap_line = $self->all_gap_line($gapchar);
1223     } else {
1224         $gap_line = $self->gap_line($gapchar);
1225     }
1226     my $aln = $self->new;
1227
1228     my @remove;
1229     my $length = 0;
1230     my $del_char = $gapchar || $self->gap_char;
1231     # Do the matching to get the segments to remove
1232     while ($gap_line =~ m/[$del_char]/g) {
1233         my $start = pos($gap_line)-1;
1234         $gap_line=~/\G[$del_char]+/gc;
1235         my $end = pos($gap_line)-1;
1236
1237         #have to offset the start and end for subsequent removes
1238         $start-=$length;
1239         $end  -=$length;
1240         $length += ($end-$start+1);
1241         push @remove, [$start,$end];
1242     }
1243
1244     #remove the segments
1245     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1246     # fix for meta, sf, ann
1247     return $aln;
1248 }
1249
1250
1251 sub _remove_col {
1252     my ($self,$aln,$remove) = @_;
1253     my @new;
1254
1255     my $gap = $self->gap_char;
1256
1257     # splice out the segments and create new seq
1258     foreach my $seq($self->each_seq){
1259         my $new_seq = Bio::LocatableSeq->new(
1260                                              -id      => $seq->id,
1261                                              -alphabet=> $seq->alphabet,
1262                                              -strand  => $seq->strand,
1263                                              -verbose => $self->verbose);
1264         my $sequence = $seq->seq;
1265         foreach my $pair(@{$remove}){
1266             my $start = $pair->[0];
1267             my $end   = $pair->[1];
1268             $sequence = $seq->seq unless $sequence;
1269             my $orig = $sequence;
1270             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1271             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1272             $sequence = $head.$tail;
1273             # start
1274             unless (defined $new_seq->start) {
1275                 if ($start == 0) {
1276                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1277                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1278                 }
1279                 else {
1280                     my $start_adjust = $orig =~ /^$gap+/;
1281                     if ($start_adjust) {
1282                         $start_adjust = $+[0] == $start;
1283                     }
1284                     $new_seq->start($seq->start + $start_adjust);
1285                 }
1286             }
1287             # end
1288             if (($end + 1) >= CORE::length($orig)) {
1289                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1290                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1291             }
1292             else {
1293                 $new_seq->end($seq->end);
1294             }
1295         }
1296
1297         if ($new_seq->end < $new_seq->start) {
1298             # we removed all columns except for gaps: set to 0 to indicate no
1299             # sequence
1300             $new_seq->start(0);
1301             $new_seq->end(0);
1302         }
1303
1304         $new_seq->seq($sequence) if $sequence;
1305                 push @new, $new_seq;
1306     }
1307     # add the new seqs to the alignment
1308     foreach my $new(@new){
1309         $aln->add_seq($new);
1310     }
1311     # fix for meta, sf, ann
1312     return $aln;
1313 }
1314
1315 sub _remove_columns_by_type {
1316         my ($self,$type) = @_;
1317         my $aln = $self->new;
1318         my @remove;
1319
1320         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1321         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1322         my %matchchars = ( 'match'           => '\*',
1323                        'weak'             => '\.',
1324                        'strong'           => ':',
1325                        'mismatch'         => ' ',
1326                        'gaps'             => '',
1327                        'all_gaps_columns' => ''
1328                      );
1329         # get the characters to delete against
1330         my $del_char;
1331         foreach my $type (@{$type}){
1332                 $del_char.= $matchchars{$type};
1333         }
1334
1335         my $length = 0;
1336         my $match_line = $self->match_line;
1337         # do the matching to get the segments to remove
1338         if($del_char){
1339                 while($match_line =~ m/[$del_char]/g ){
1340                         my $start = pos($match_line)-1;
1341                         $match_line=~/\G[$del_char]+/gc;
1342                         my $end = pos($match_line)-1;
1343
1344                         #have to offset the start and end for subsequent removes
1345                         $start-=$length;
1346                         $end  -=$length;
1347                         $length += ($end-$start+1);
1348                         push @remove, [$start,$end];
1349                 }
1350         }
1351
1352         # remove the segments
1353         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1354         $aln = $aln->remove_gaps() if $gap;
1355         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1356     # fix for meta, sf, ann
1357         $aln;
1358 }
1359
1360
1361 sub _remove_columns_by_num {
1362         my ($self,$positions) = @_;
1363         my $aln = $self->new;
1364
1365         # sort the positions
1366         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1367
1368     my @remove;
1369     my $length = 0;
1370     foreach my $pos (@{$positions}) {
1371         my ($start, $end) = @{$pos};
1372
1373         #have to offset the start and end for subsequent removes
1374         $start-=$length;
1375         $end  -=$length;
1376         $length += ($end-$start+1);
1377         push @remove, [$start,$end];
1378     }
1379
1380     #remove the segments
1381     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1382     # fix for meta, sf, ann
1383         $aln;
1384 }
1385
1386
1387 =head1 Change sequences within the MSA
1388
1389 These methods affect characters in all sequences without changing the
1390 alignment.
1391
1392 =head2 splice_by_seq_pos
1393
1394  Title   : splice_by_seq_pos
1395  Usage   : $status = splice_by_seq_pos(1);
1396  Function: splices all aligned sequences where the specified sequence
1397            has gaps.
1398  Example :
1399  Returns : 1 on success
1400  Args    : position of sequence to splice by
1401
1402
1403 =cut
1404
1405 sub splice_by_seq_pos{
1406   my ($self,$pos) = @_;
1407
1408   my $guide = $self->get_seq_by_pos($pos);
1409   my $guide_seq = $guide->seq;
1410
1411   $guide_seq =~ s/\./\-/g;
1412
1413   my @gaps = ();
1414   $pos = -1;
1415   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1416     unshift @gaps, $pos;
1417     $pos++;
1418   }
1419
1420   foreach my $seq ($self->each_seq){
1421     my @bases = split '', $seq->seq;
1422
1423     splice(@bases, $_, 1) foreach @gaps;
1424     $seq->seq(join('', @bases));
1425   }
1426
1427   1;
1428 }
1429
1430 =head2 map_chars
1431
1432  Title     : map_chars
1433  Usage     : $ali->map_chars('\.','-')
1434  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1435              characters
1436
1437              Notice that the from (arg1) is interpretted as a regex,
1438              so be careful about quoting meta characters (eg
1439              $ali->map_chars('.','-') wont do what you want)
1440  Returns   :
1441  Argument  : 'from' rexexp
1442              'to' string
1443
1444 =cut
1445
1446 sub map_chars {
1447     my $self = shift;
1448     my $from = shift;
1449     my $to   = shift;
1450     my ($seq,$temp);
1451
1452     $self->throw("Need exactly two arguments")
1453         unless defined $from and defined $to;
1454
1455     foreach $seq ( $self->each_seq() ) {
1456         $temp = $seq->seq();
1457         $temp =~ s/$from/$to/g;
1458         $seq->seq($temp);
1459     }
1460     return 1;
1461 }
1462
1463
1464 =head2 uppercase
1465
1466  Title     : uppercase()
1467  Usage     : $ali->uppercase()
1468  Function  : Sets all the sequences to uppercase
1469  Returns   :
1470  Argument  :
1471
1472 =cut
1473
1474 sub uppercase {
1475     my $self = shift;
1476     my $seq;
1477     my $temp;
1478
1479     foreach $seq ( $self->each_seq() ) {
1480       $temp = $seq->seq();
1481       $temp =~ tr/[a-z]/[A-Z]/;
1482
1483       $seq->seq($temp);
1484     }
1485     return 1;
1486 }
1487
1488 =head2 cigar_line
1489
1490  Title    : cigar_line()
1491  Usage    : %cigars = $align->cigar_line()
1492  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1493             Report) line for each sequence in the alignment. Examples are
1494             "1,60" or "5,10:12,58", where the numbers refer to conserved
1495             positions within the alignment. The keys of the hash are the
1496             NSEs (name/start/end) assigned to each sequence.
1497  Args     : threshold (optional, defaults to 100)
1498  Returns  : Hash of strings (cigar lines)
1499
1500 =cut
1501
1502 sub cigar_line {
1503         my $self = shift;
1504         my $thr=shift||100;
1505         my %cigars;
1506
1507         my @consensus = split "",($self->consensus_string($thr));
1508         my $len = $self->length;
1509         my $gapchar = $self->gap_char;
1510
1511         # create a precursor, something like (1,4,5,6,7,33,45),
1512         # where each number corresponds to a conserved position
1513         foreach my $seq ( $self->each_seq ) {
1514                 my @seq = split "", uc ($seq->seq);
1515                 my $pos = 1;
1516                 for (my $x = 0 ; $x < $len ; $x++ ) {
1517                         if ($seq[$x] eq $consensus[$x]) {
1518                                 push @{$cigars{$seq->get_nse}},$pos;
1519                                 $pos++;
1520                         } elsif ($seq[$x] ne $gapchar) {
1521                                 $pos++;
1522                         }
1523                 }
1524         }
1525         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1526         for my $name (keys %cigars) {
1527                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1528                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1529       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1530            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1531                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1532                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1533                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1534                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1535                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1536                         }
1537       }
1538         }
1539   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1540   for my $name (keys %cigars) {
1541           my @remove;
1542           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1543                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1544                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1545                       unshift @remove,$x;
1546               }
1547            }
1548       for my $pos (@remove) {
1549                         splice @{$cigars{$name}}, $pos, 1;
1550            }
1551    }
1552    # join and punctuate
1553    for my $name (keys %cigars) {
1554           my ($start,$end,$str) = "";
1555           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1556                   $str .= ($start . "," . $end . ":");
1557           }
1558           $str =~ s/:$//;
1559       $cigars{$name} = $str;
1560    }
1561    %cigars;
1562 }
1563
1564
1565 =head2 match_line
1566
1567  Title    : match_line()
1568  Usage    : $line = $align->match_line()
1569  Function : Generates a match line - much like consensus string
1570             except that a line indicating the '*' for a match.
1571  Args     : (optional) Match line characters ('*' by default)
1572             (optional) Strong match char (':' by default)
1573             (optional) Weak match char ('.' by default)
1574  Returns  : String
1575
1576 =cut
1577
1578 sub match_line {
1579         my ($self,$matchlinechar, $strong, $weak) = @_;
1580         my %matchchars = ('match'    => $matchlinechar || '*',
1581                                                           'weak'     => $weak          || '.',
1582                                                           'strong'   => $strong        || ':',
1583                                                           'mismatch' => ' ',
1584                                                   );
1585
1586         my @seqchars;
1587         my $alphabet;
1588         foreach my $seq ( $self->each_seq ) {
1589                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1590                 $alphabet = $seq->alphabet unless defined $alphabet;
1591         }
1592         my $refseq = shift @seqchars;
1593         # let's just march down the columns
1594         my $matchline;
1595  POS:
1596         foreach my $pos ( 0..$self->length ) {
1597                 my $refchar = $refseq->[$pos];
1598                 my $char = $matchchars{'mismatch'};
1599                 unless( defined $refchar ) {
1600                         last if $pos == $self->length; # short circuit on last residue
1601                         # this in place to handle jason's soon-to-be-committed
1602                         # intron mapping code
1603                         goto bottom;
1604                 }
1605                 my %col = ($refchar => 1);
1606                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1607                 foreach my $seq ( @seqchars ) {
1608                         next if $pos >= scalar @$seq;
1609                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1610                                                           $seq->[$pos] eq ' ' );
1611                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1612                 }
1613                 my @colresidues = sort keys %col;
1614
1615                 # if all the values are the same
1616                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1617                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1618                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1619                         # matches for protein seqs
1620             TYPE:
1621                         foreach my $type ( qw(strong weak) ) {
1622                                 # iterate through categories
1623                                 my %groups;
1624                                 # iterate through each of the aa in the col
1625                                 # look to see which groups it is in
1626                                 foreach my $c ( @colresidues ) {
1627                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1628                                                 push @{$groups{$f}},$c;
1629                                         }
1630                                 }
1631                          GRP:
1632                                 foreach my $cols ( values %groups ) {
1633                                         @$cols = sort @$cols;
1634                                         # now we are just testing to see if two arrays
1635                                         # are identical w/o changing either one
1636                                         # have to be same len
1637                                         next if( scalar @$cols != scalar @colresidues );
1638                                         # walk down the length and check each slot
1639                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1640                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1641                                         }
1642                                         $char = $matchchars{$type};
1643                                         last TYPE;
1644                                 }
1645                         }
1646                 }
1647          bottom:
1648                 $matchline .= $char;
1649         }
1650         return $matchline;
1651 }
1652
1653
1654 =head2 gap_line
1655
1656  Title    : gap_line()
1657  Usage    : $line = $align->gap_line()
1658  Function : Generates a gap line - much like consensus string
1659             except that a line where '-' represents gap
1660  Args     : (optional) gap line characters ('-' by default)
1661  Returns  : string
1662
1663 =cut
1664
1665 sub gap_line {
1666     my ($self,$gapchar) = @_;
1667     $gapchar = $gapchar || $self->gap_char;
1668     my %gap_hsh; # column gaps vector
1669     foreach my $seq ( $self->each_seq ) {
1670                 my $i = 0;
1671         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1672                   map {[$i++, $_]} split(//, uc ($seq->seq));
1673     }
1674     my $gap_line;
1675     foreach my $pos ( 0..$self->length-1 ) {
1676           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1677     }
1678     return $gap_line;
1679 }
1680
1681 =head2 all_gap_line
1682
1683  Title    : all_gap_line()
1684  Usage    : $line = $align->all_gap_line()
1685  Function : Generates a gap line - much like consensus string
1686             except that a line where '-' represents all-gap column
1687  Args     : (optional) gap line characters ('-' by default)
1688  Returns  : string
1689
1690 =cut
1691
1692 sub all_gap_line {
1693     my ($self,$gapchar) = @_;
1694     $gapchar = $gapchar || $self->gap_char;
1695     my %gap_hsh;                # column gaps counter hash
1696     my @seqs = $self->each_seq;
1697     foreach my $seq ( @seqs ) {
1698         my $i = 0;
1699         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1700         map {[$i++, $_]} split(//, uc ($seq->seq));
1701     }
1702     my $gap_line;
1703     foreach my $pos ( 0..$self->length-1 ) {
1704         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1705             # gaps column
1706             $gap_line .= $gapchar;
1707         } else {
1708             $gap_line .= '.';
1709         }
1710     }
1711     return $gap_line;
1712 }
1713
1714 =head2 gap_col_matrix
1715
1716  Title    : gap_col_matrix()
1717  Usage    : my $cols = $align->gap_col_matrix()
1718  Function : Generates an array of hashes where
1719             each entry in the array is a hash reference
1720             with keys of all the sequence names and
1721             and value of 1 or 0 if the sequence has a gap at that column
1722  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1723
1724 =cut
1725
1726 sub gap_col_matrix {
1727     my ($self,$gapchar) = @_;
1728     $gapchar = $gapchar || $self->gap_char;
1729     my %gap_hsh; # column gaps vector
1730     my @cols;
1731     foreach my $seq ( $self->each_seq ) {
1732         my $i = 0;
1733         my $str = $seq->seq;
1734         my $len = $seq->length;
1735         my $ch;
1736         my $id = $seq->display_id;
1737         while( $i < $len ) {
1738             $ch = substr($str, $i, 1);
1739             $cols[$i++]->{$id} = ($ch eq $gapchar);
1740         }
1741     }
1742     return \@cols;
1743 }
1744
1745 =head2 match
1746
1747  Title     : match()
1748  Usage     : $ali->match()
1749  Function  : Goes through all columns and changes residues that are
1750              identical to residue in first sequence to match '.'
1751              character. Sets match_char.
1752
1753              USE WITH CARE: Most MSA formats do not support match
1754              characters in sequences, so this is mostly for output
1755              only. NEXUS format (Bio::AlignIO::nexus) can handle
1756              it.
1757  Returns   : 1
1758  Argument  : a match character, optional, defaults to '.'
1759
1760 =cut
1761
1762 sub match {
1763     my ($self, $match) = @_;
1764
1765     $match ||= '.';
1766     my ($matching_char) = $match;
1767     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1768     $self->map_chars($matching_char, '-');
1769
1770     my @seqs = $self->each_seq();
1771     return 1 unless scalar @seqs > 1;
1772
1773     my $refseq = shift @seqs ;
1774     my @refseq = split //, $refseq->seq;
1775     my $gapchar = $self->gap_char;
1776
1777     foreach my $seq ( @seqs ) {
1778         my @varseq = split //, $seq->seq();
1779         for ( my $i=0; $i < scalar @varseq; $i++) {
1780             $varseq[$i] = $match if defined $refseq[$i] &&
1781                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1782                   $refseq[$i] =~ /$gapchar/ )
1783                       && $refseq[$i] eq $varseq[$i];
1784         }
1785         $seq->seq(join '', @varseq);
1786     }
1787     $self->match_char($match);
1788     return 1;
1789 }
1790
1791
1792 =head2 unmatch
1793
1794  Title     : unmatch()
1795  Usage     : $ali->unmatch()
1796  Function  : Undoes the effect of method match. Unsets match_char.
1797  Returns   : 1
1798  Argument  : a match character, optional, defaults to '.'
1799
1800 See L<match> and L<match_char>
1801
1802 =cut
1803
1804 sub unmatch {
1805     my ($self, $match) = @_;
1806
1807     $match ||= '.';
1808
1809     my @seqs = $self->each_seq();
1810     return 1 unless scalar @seqs > 1;
1811
1812     my $refseq = shift @seqs ;
1813     my @refseq = split //, $refseq->seq;
1814     my $gapchar = $self->gap_char;
1815     foreach my $seq ( @seqs ) {
1816         my @varseq = split //, $seq->seq();
1817         for ( my $i=0; $i < scalar @varseq; $i++) {
1818             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1819                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1820                   $refseq[$i] =~ /$gapchar/ ) &&
1821                       $varseq[$i] eq $match;
1822         }
1823         $seq->seq(join '', @varseq);
1824     }
1825     $self->match_char('');
1826     return 1;
1827 }
1828
1829 =head1 MSA attributes
1830
1831 Methods for setting and reading the MSA attributes.
1832
1833 Note that the methods defining character semantics depend on the user
1834 to set them sensibly.  They are needed only by certain input/output
1835 methods. Unset them by setting to an empty string ('').
1836
1837 =head2 id
1838
1839  Title     : id
1840  Usage     : $myalign->id("Ig")
1841  Function  : Gets/sets the id field of the alignment
1842  Returns   : An id string
1843  Argument  : An id string (optional)
1844
1845 =cut
1846
1847 sub id {
1848     my ($self, $name) = @_;
1849
1850     if (defined( $name )) {
1851         $self->{'_id'} = $name;
1852     }
1853
1854     return $self->{'_id'};
1855 }
1856
1857 =head2 accession
1858
1859  Title     : accession
1860  Usage     : $myalign->accession("PF00244")
1861  Function  : Gets/sets the accession field of the alignment
1862  Returns   : An acc string
1863  Argument  : An acc string (optional)
1864
1865 =cut
1866
1867 sub accession {
1868     my ($self, $acc) = @_;
1869
1870     if (defined( $acc )) {
1871         $self->{'_accession'} = $acc;
1872     }
1873
1874     return $self->{'_accession'};
1875 }
1876
1877 =head2 description
1878
1879  Title     : description
1880  Usage     : $myalign->description("14-3-3 proteins")
1881  Function  : Gets/sets the description field of the alignment
1882  Returns   : An description string
1883  Argument  : An description string (optional)
1884
1885 =cut
1886
1887 sub description {
1888     my ($self, $name) = @_;
1889
1890     if (defined( $name )) {
1891         $self->{'_description'} = $name;
1892     }
1893
1894     return $self->{'_description'};
1895 }
1896
1897 =head2 missing_char
1898
1899  Title     : missing_char
1900  Usage     : $myalign->missing_char("?")
1901  Function  : Gets/sets the missing_char attribute of the alignment
1902              It is generally recommended to set it to 'n' or 'N'
1903              for nucleotides and to 'X' for protein.
1904  Returns   : An missing_char string,
1905  Argument  : An missing_char string (optional)
1906
1907 =cut
1908
1909 sub missing_char {
1910     my ($self, $char) = @_;
1911
1912     if (defined $char ) {
1913         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1914         $self->{'_missing_char'} = $char;
1915     }
1916
1917     return $self->{'_missing_char'};
1918 }
1919
1920 =head2 match_char
1921
1922  Title     : match_char
1923  Usage     : $myalign->match_char('.')
1924  Function  : Gets/sets the match_char attribute of the alignment
1925  Returns   : An match_char string,
1926  Argument  : An match_char string (optional)
1927
1928 =cut
1929
1930 sub match_char {
1931     my ($self, $char) = @_;
1932
1933     if (defined $char ) {
1934         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1935         $self->{'_match_char'} = $char;
1936     }
1937
1938     return $self->{'_match_char'};
1939 }
1940
1941 =head2 gap_char
1942
1943  Title     : gap_char
1944  Usage     : $myalign->gap_char('-')
1945  Function  : Gets/sets the gap_char attribute of the alignment
1946  Returns   : An gap_char string, defaults to '-'
1947  Argument  : An gap_char string (optional)
1948
1949 =cut
1950
1951 sub gap_char {
1952     my ($self, $char) = @_;
1953
1954     if (defined $char || ! defined $self->{'_gap_char'} ) {
1955         $char= '-' unless defined $char;
1956         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1957         $self->{'_gap_char'} = $char;
1958     }
1959     return $self->{'_gap_char'};
1960 }
1961
1962 =head2 symbol_chars
1963
1964  Title   : symbol_chars
1965  Usage   : my @symbolchars = $aln->symbol_chars;
1966  Function: Returns all the seen symbols (other than gaps)
1967  Returns : array of characters that are the seen symbols
1968  Args    : boolean to include the gap/missing/match characters
1969
1970 =cut
1971
1972 sub symbol_chars{
1973    my ($self,$includeextra) = @_;
1974
1975    unless ($self->{'_symbols'}) {
1976        foreach my $seq ($self->each_seq) {
1977            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1978        }
1979    }
1980    my %copy = %{$self->{'_symbols'}};
1981    if( ! $includeextra ) {
1982        foreach my $char ( $self->gap_char, $self->match_char,
1983                           $self->missing_char) {
1984            delete $copy{$char} if( defined $char );
1985        }
1986    }
1987    return keys %copy;
1988 }
1989
1990 =head1 Alignment descriptors
1991
1992 These read only methods describe the MSA in various ways.
1993
1994
1995 =head2 score
1996
1997  Title     : score
1998  Usage     : $str = $ali->score()
1999  Function  : get/set a score of the alignment
2000  Returns   : a score for the alignment
2001  Argument  : an optional score to set
2002
2003 =cut
2004
2005 sub score {
2006   my $self = shift;
2007   $self->{score} = shift if @_;
2008   return $self->{score};
2009 }
2010
2011 =head2 consensus_string
2012
2013  Title     : consensus_string
2014  Usage     : $str = $ali->consensus_string($threshold_percent)
2015  Function  : Makes a strict consensus
2016  Returns   : Consensus string
2017  Argument  : Optional threshold ranging from 0 to 100.
2018              The consensus residue has to appear at least threshold %
2019              of the sequences at a given location, otherwise a '?'
2020              character will be placed at that location.
2021              (Default value = 0%)
2022
2023 =cut
2024
2025 sub consensus_string {
2026     my $self = shift;
2027     my $threshold = shift;
2028
2029     my $out = "";
2030     my $len = $self->length - 1;
2031
2032     foreach ( 0 .. $len ) {
2033         $out .= $self->_consensus_aa($_,$threshold);
2034     }
2035     return $out;
2036 }
2037
2038 =head2 consensus_conservation
2039
2040  Title     : consensus_conservation
2041  Usage     : @conservation = $ali->consensus_conservation();
2042  Function  : Conservation (as a percent) of each position of alignment
2043  Returns   : Array of percentages [0-100]. Gap columns are 0% conserved.
2044  Argument  :
2045
2046 =cut
2047
2048 sub consensus_conservation {
2049     my $self = shift;
2050     my @cons;
2051     my $num_sequences = $self->num_sequences;
2052     foreach my $point (0..$self->length-1) {
2053         my %hash = $self->_consensus_counts($point);
2054         # max frequency of a non-gap letter
2055         my $max = (sort {$b<=>$a} values %hash )[0];
2056         push @cons, 100 * $max / $num_sequences;
2057     }
2058     return @cons;
2059 }
2060
2061 sub _consensus_aa {
2062     my $self = shift;
2063     my $point = shift;
2064     my $threshold_percent = shift || -1 ;
2065     my ($seq,%hash,$count,$letter,$key);
2066     my $gapchar = $self->gap_char;
2067     %hash = $self->_consensus_counts($point);
2068     my $number_of_sequences = $self->num_sequences();
2069     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2070     $count = -1;
2071     $letter = '?';
2072
2073     foreach $key ( sort keys %hash ) {
2074         # print "Now at $key $hash{$key}\n";
2075         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2076             $letter = $key;
2077             $count = $hash{$key};
2078         }
2079     }
2080     return $letter;
2081 }
2082
2083 # Frequency of each letter in one column
2084 sub _consensus_counts {
2085     my $self = shift;
2086     my $point = shift;
2087     my %hash;
2088     my $gapchar = $self->gap_char;
2089     foreach my $seq ( $self->each_seq() ) {
2090         my $letter = substr($seq->seq,$point,1);
2091         $self->throw("--$point-----------") if $letter eq '';
2092         ($letter eq $gapchar || $letter =~ /\./) && next;
2093         $hash{$letter}++;
2094     }
2095     return %hash;
2096 }
2097
2098
2099 =head2 consensus_iupac
2100
2101  Title     : consensus_iupac
2102  Usage     : $str = $ali->consensus_iupac()
2103  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2104              and RNA. The output is in upper case except when gaps in
2105              a column force output to be in lower case.
2106
2107              Note that if your alignment sequences contain a lot of
2108              IUPAC ambiquity codes you often have to manually set
2109              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2110              indicate a protein sequence.
2111  Returns   : consensus string
2112  Argument  : none
2113  Throws    : on protein sequences
2114
2115 =cut
2116
2117 sub consensus_iupac {
2118     my $self = shift;
2119     my $out = "";
2120     my $len = $self->length-1;
2121
2122     # only DNA and RNA sequences are valid
2123     foreach my $seq ( $self->each_seq() ) {
2124         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2125             if $seq->alphabet eq 'protein';
2126     }
2127     # loop over the alignment columns
2128     foreach my $count ( 0 .. $len ) {
2129         $out .= $self->_consensus_iupac($count);
2130     }
2131     return $out;
2132 }
2133
2134 sub _consensus_iupac {
2135     my ($self, $column) = @_;
2136     my ($string, $char, $rna);
2137
2138     #determine all residues in a column
2139     foreach my $seq ( $self->each_seq() ) {
2140         $string .= substr($seq->seq, $column, 1);
2141     }
2142     $string = uc $string;
2143
2144     # quick exit if there's an N in the string
2145     if ($string =~ /N/) {
2146         $string =~ /\W/ ? return 'n' : return 'N';
2147     }
2148     # ... or if there are only gap characters
2149     return '-' if $string =~ /^\W+$/;
2150
2151     # treat RNA as DNA in regexps
2152     if ($string =~ /U/) {
2153         $string =~ s/U/T/;
2154         $rna = 1;
2155     }
2156
2157     # the following s///'s only need to be done to the _first_ ambiguity code
2158     # as we only need to see the _range_ of characters in $string
2159
2160     if ($string =~ /[VDHB]/) {
2161         $string =~ s/V/AGC/;
2162         $string =~ s/D/AGT/;
2163         $string =~ s/H/ACT/;
2164         $string =~ s/B/CTG/;
2165     }
2166
2167     if ($string =~ /[SKYRWM]/) {
2168         $string =~ s/S/GC/;
2169         $string =~ s/K/GT/;
2170         $string =~ s/Y/CT/;
2171         $string =~ s/R/AG/;
2172         $string =~ s/W/AT/;
2173         $string =~ s/M/AC/;
2174     }
2175
2176     # and now the guts of the thing
2177
2178     if ($string =~ /A/) {
2179         $char = 'A';                     # A                      A
2180         if ($string =~ /G/) {
2181             $char = 'R';                 # A and G (purines)      R
2182             if ($string =~ /C/) {
2183                 $char = 'V';             # A and G and C          V
2184                 if ($string =~ /T/) {
2185                     $char = 'N';         # A and G and C and T    N
2186                 }
2187             } elsif ($string =~ /T/) {
2188                 $char = 'D';             # A and G and T          D
2189             }
2190         } elsif ($string =~ /C/) {
2191             $char = 'M';                 # A and C                M
2192             if ($string =~ /T/) {
2193                 $char = 'H';             # A and C and T          H
2194             }
2195         } elsif ($string =~ /T/) {
2196             $char = 'W';                 # A and T                W
2197         }
2198     } elsif ($string =~ /C/) {
2199         $char = 'C';                     # C                      C
2200         if ($string =~ /T/) {
2201             $char = 'Y';                 # C and T (pyrimidines)  Y
2202             if ($string =~ /G/) {
2203                 $char = 'B';             # C and T and G          B
2204             }
2205         } elsif ($string =~ /G/) {
2206             $char = 'S';                 # C and G                S
2207         }
2208     } elsif ($string =~ /G/) {
2209         $char = 'G';                     # G                      G
2210         if ($string =~ /C/) {
2211             $char = 'S';                 # G and C                S
2212         } elsif ($string =~ /T/) {
2213             $char = 'K';                 # G and T                K
2214         }
2215     } elsif ($string =~ /T/) {
2216         $char = 'T';                     # T                      T
2217     }
2218
2219     $char = 'U' if $rna and $char eq 'T';
2220     $char = lc $char if $string =~ /\W/;
2221
2222     return $char;
2223 }
2224
2225
2226 =head2 consensus_meta
2227
2228  Title     : consensus_meta
2229  Usage     : $seqmeta = $ali->consensus_meta()
2230  Function  : Returns a Bio::Seq::Meta object containing the consensus
2231              strings derived from meta data analysis.
2232  Returns   : Bio::Seq::Meta
2233  Argument  : Bio::Seq::Meta
2234  Throws    : non-MetaI object
2235
2236 =cut
2237
2238 sub consensus_meta {
2239     my ($self, $meta) = @_;
2240     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2241         $self->throw('Not a Bio::Seq::MetaI object');
2242     }
2243     return $self->{'_aln_meta'} = $meta if $meta;
2244     return $self->{'_aln_meta'}
2245 }
2246
2247 =head2 is_flush
2248
2249  Title     : is_flush
2250  Usage     : if ( $ali->is_flush() )
2251  Function  : Tells you whether the alignment
2252            : is flush, i.e. all of the same length
2253  Returns   : 1 or 0
2254  Argument  :
2255
2256 =cut
2257
2258 sub is_flush {
2259     my ($self,$report) = @_;
2260     my $seq;
2261     my $length = (-1);
2262     my $temp;
2263
2264     foreach $seq ( $self->each_seq() ) {
2265         if( $length == (-1) ) {
2266             $length = CORE::length($seq->seq());
2267             next;
2268         }
2269
2270         $temp = CORE::length($seq->seq());
2271         if( $temp != $length ) {
2272             $self->warn("expecting $length not $temp from ".
2273                         $seq->display_id) if( $report );
2274             $self->debug("expecting $length not $temp from ".
2275                          $seq->display_id);
2276             $self->debug($seq->seq(). "\n");
2277             return 0;
2278         }
2279     }
2280
2281     return 1;
2282 }
2283
2284
2285 =head2 length
2286
2287  Title     : length()
2288  Usage     : $len = $ali->length()
2289  Function  : Returns the maximum length of the alignment.
2290              To be sure the alignment is a block, use is_flush
2291  Returns   : Integer
2292  Argument  :
2293
2294 =cut
2295
2296 sub length_aln {
2297     my $self = shift;
2298     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2299     $self->length(@_);
2300 }
2301
2302 sub length {
2303     my $self = shift;
2304     my $seq;
2305     my $length = -1;
2306     my $temp;
2307
2308     foreach $seq ( $self->each_seq() ) {
2309         $temp = $seq->length();
2310         if( $temp > $length ) {
2311             $length = $temp;
2312         }
2313     }
2314
2315     return $length;
2316 }
2317
2318
2319 =head2 maxdisplayname_length
2320
2321  Title     : maxdisplayname_length
2322  Usage     : $ali->maxdisplayname_length()
2323  Function  : Gets the maximum length of the displayname in the
2324              alignment. Used in writing out various MSA formats.
2325  Returns   : integer
2326  Argument  :
2327
2328 =cut
2329
2330 sub maxname_length {
2331     my $self = shift;
2332     $self->deprecated("maxname_length - deprecated method.".
2333                       " Use maxdisplayname_length() instead.");
2334     $self->maxdisplayname_length();
2335 }
2336
2337 sub maxnse_length {
2338     my $self = shift;
2339     $self->deprecated("maxnse_length - deprecated method.".
2340                       " Use maxnse_length() instead.");
2341     $self->maxdisplayname_length();
2342 }
2343
2344 sub maxdisplayname_length {
2345     my $self = shift;
2346     my $maxname = (-1);
2347     my ($seq,$len);
2348
2349     foreach $seq ( $self->each_seq() ) {
2350         $len = CORE::length $self->displayname($seq->get_nse());
2351
2352         if( $len > $maxname ) {
2353             $maxname = $len;
2354         }
2355     }
2356
2357     return $maxname;
2358 }
2359
2360 =head2 max_metaname_length
2361
2362  Title     : max_metaname_length
2363  Usage     : $ali->max_metaname_length()
2364  Function  : Gets the maximum length of the meta name tags in the
2365              alignment for the sequences and for the alignment.
2366              Used in writing out various MSA formats.
2367  Returns   : integer
2368  Argument  : None
2369
2370 =cut
2371
2372 sub max_metaname_length {
2373     my $self = shift;
2374     my $maxname = (-1);
2375     my ($seq,$len);
2376
2377     # check seq meta first
2378     for $seq ( $self->each_seq() ) {
2379         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2380         for my $mtag ($seq->meta_names) {
2381             $len = CORE::length $mtag;
2382             if( $len > $maxname ) {
2383                 $maxname = $len;
2384             }
2385         }
2386     }
2387
2388     # alignment meta
2389     for my $meta ($self->consensus_meta) {
2390         next unless $meta;
2391         for my $name ($meta->meta_names) {
2392             $len = CORE::length $name;
2393             if( $len > $maxname ) {
2394                 $maxname = $len;
2395             }
2396         }
2397     }
2398
2399     return $maxname;
2400 }
2401
2402 =head2 num_residues
2403
2404  Title     : num_residues
2405  Usage     : $no = $ali->num_residues
2406  Function  : number of residues in total in the alignment
2407  Returns   : integer
2408  Argument  :
2409  Note      : replaces no_residues()
2410
2411 =cut
2412
2413 sub num_residues {
2414     my $self = shift;
2415     my $count = 0;
2416
2417     foreach my $seq ($self->each_seq) {
2418         my $str = $seq->seq();
2419
2420         $count += ($str =~ s/[A-Za-z]//g);
2421     }
2422
2423     return $count;
2424 }
2425
2426 =head2 num_sequences
2427
2428  Title     : num_sequences
2429  Usage     : $depth = $ali->num_sequences
2430  Function  : number of sequence in the sequence alignment
2431  Returns   : integer
2432  Argument  : none
2433  Note      : replaces no_sequences()
2434
2435 =cut
2436
2437 sub num_sequences {
2438     my $self = shift;
2439     return scalar($self->each_seq);
2440 }
2441
2442
2443 =head2 average_percentage_identity
2444
2445  Title   : average_percentage_identity
2446  Usage   : $id = $align->average_percentage_identity
2447  Function: The function uses a fast method to calculate the average
2448            percentage identity of the alignment
2449  Returns : The average percentage identity of the alignment
2450  Args    : None
2451  Notes   : This method implemented by Kevin Howe calculates a figure that is
2452            designed to be similar to the average pairwise identity of the
2453            alignment (identical in the absence of gaps), without having to
2454            explicitly calculate pairwise identities proposed by Richard Durbin.
2455            Validated by Ewan Birney ad Alex Bateman.
2456
2457 =cut
2458
2459 sub average_percentage_identity{
2460    my ($self,@args) = @_;
2461
2462    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2463                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2464
2465    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2466
2467    if (! $self->is_flush()) {
2468        $self->throw("All sequences in the alignment must be the same length");
2469    }
2470
2471    @seqs = $self->each_seq();
2472    $len = $self->length();
2473
2474    # load the each hash with correct keys for existence checks
2475
2476    for( my $index=0; $index < $len; $index++) {
2477        foreach my $letter (@alphabet) {
2478            $countHashes[$index]->{$letter} = 0;
2479        }
2480    }
2481    foreach my $seq (@seqs)  {
2482        my @seqChars = split //, $seq->seq();
2483        for( my $column=0; $column < @seqChars; $column++ ) {
2484            my $char = uc($seqChars[$column]);
2485            if (exists $countHashes[$column]->{$char}) {
2486                $countHashes[$column]->{$char}++;
2487            }
2488        }
2489    }
2490
2491    $total = 0;
2492    $divisor = 0;
2493    for(my $column =0; $column < $len; $column++) {
2494        my %hash = %{$countHashes[$column]};
2495        $subdivisor = 0;
2496        foreach my $res (keys %hash) {
2497            $total += $hash{$res}*($hash{$res} - 1);
2498            $subdivisor += $hash{$res};
2499        }
2500        $divisor += $subdivisor * ($subdivisor - 1);
2501    }
2502    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2503 }
2504
2505 =head2 percentage_identity
2506
2507  Title   : percentage_identity
2508  Usage   : $id = $align->percentage_identity
2509  Function: The function calculates the average percentage identity
2510            (aliased to average_percentage_identity)
2511  Returns : The average percentage identity
2512  Args    : None
2513
2514 =cut
2515
2516 sub percentage_identity {
2517     my $self = shift;
2518     return $self->average_percentage_identity();
2519 }
2520
2521 =head2 overall_percentage_identity
2522
2523  Title   : overall_percentage_identity
2524  Usage   : $id = $align->overall_percentage_identity
2525            $id = $align->overall_percentage_identity('short')
2526  Function: The function calculates the percentage identity of
2527            the conserved columns
2528  Returns : The percentage identity of the conserved columns
2529  Args    : length value to use, optional defaults to alignment length
2530                  possible values: 'align', 'short', 'long'
2531
2532 The argument values 'short' and 'long' refer to shortest and longest
2533 sequence in the alignment. Method modification code by Hongyu Zhang.
2534
2535 =cut
2536
2537 sub overall_percentage_identity{
2538    my ($self, $length_measure) = @_;
2539
2540    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2541
2542    my %enum = map {$_ => undef} qw (align short long);
2543
2544    $self->throw("Unknown argument [$length_measure]")
2545        if $length_measure and not exists $enum{$length_measure};
2546    $length_measure ||= 'align';
2547
2548    if (! $self->is_flush()) {
2549        $self->throw("All sequences in the alignment must be the same length");
2550    }
2551
2552    # Count the residues seen at each position
2553    my $len;
2554    my $total = 0; # number of positions with identical residues
2555    my @countHashes;
2556    my @seqs = $self->each_seq;
2557    my $nof_seqs = scalar @seqs;
2558    my $aln_len = $self->length();
2559    for my $seq (@seqs)  {
2560        my $seqstr = $seq->seq;
2561
2562        # Count residues for given sequence
2563        for my $column (0 .. $aln_len-1) {
2564            my $char = uc( substr($seqstr, $column, 1) );
2565            if ( exists $alphabet{$char} ) {
2566
2567                # This is a valid char
2568                if ( defined $countHashes[$column]->{$char} ) {
2569                  $countHashes[$column]->{$char}++;
2570                } else {
2571                  $countHashes[$column]->{$char} = 1;
2572                }
2573
2574                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2575                    # All sequences have this same residue
2576                    $total++;
2577                }
2578
2579            }
2580        }
2581
2582        # Sequence length
2583        if ($length_measure eq 'short' || $length_measure eq 'long') {
2584            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2585            if ($length_measure eq 'short') {
2586                if ( (not defined $len) || ($seq_len < $len) ) {
2587                    $len = $seq_len;
2588                }
2589            } elsif ($length_measure eq 'long') {
2590                if ( (not defined $len) || ($seq_len > $len) ) {
2591                    $len = $seq_len;
2592                }
2593            }
2594        }
2595
2596    }
2597
2598    if ($length_measure eq 'align') {
2599        $len = $aln_len;
2600    }
2601
2602    return ($total / $len ) * 100.0;
2603 }
2604
2605
2606
2607 =head1 Alignment positions
2608
2609 Methods to map a sequence position into an alignment column and back.
2610 column_from_residue_number() does the former. The latter is really a
2611 property of the sequence object and can done using
2612 L<Bio::LocatableSeq::location_from_column>:
2613
2614     # select somehow a sequence from the alignment, e.g.
2615     my $seq = $aln->get_seq_by_pos(1);
2616     #$loc is undef or Bio::LocationI object
2617     my $loc = $seq->location_from_column(5);
2618
2619 =head2 column_from_residue_number
2620
2621  Title   : column_from_residue_number
2622  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2623  Function: This function gives the position in the alignment
2624            (i.e. column number) of the given residue number in the
2625            sequence with the given name. For example, for the
2626            alignment
2627
2628              Seq1/91-97 AC..DEF.GH.
2629              Seq2/24-30 ACGG.RTY...
2630                 Seq3/43-51 AC.DDEF.GHI
2631
2632            column_from_residue_number( "Seq1", 94 ) returns 6.
2633            column_from_residue_number( "Seq2", 25 ) returns 2.
2634            column_from_residue_number( "Seq3", 50 ) returns 10.
2635
2636            An exception is thrown if the residue number would lie
2637            outside the length of the aligment
2638            (e.g. column_from_residue_number( "Seq2", 22 )
2639
2640           Note: If the the parent sequence is represented by more than
2641                 one alignment sequence and the residue number is present in
2642                 them, this method finds only the first one.
2643
2644  Returns : A column number for the position in the alignment of the
2645            given residue in the given sequence (1 = first column)
2646  Args    : A sequence id/name (not a name/start-end)
2647            A residue number in the whole sequence (not just that
2648            segment of it in the alignment)
2649
2650 =cut
2651
2652 sub column_from_residue_number {
2653     my ($self, $name, $resnumber) = @_;
2654
2655     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2656     $self->throw("Second argument residue number missing") unless $resnumber;
2657
2658     foreach my $seq ($self->each_seq_with_id($name)) {
2659         my $col;
2660         eval {
2661             $col = $seq->column_from_residue_number($resnumber);
2662         };
2663         next if $@;
2664         return $col;
2665     }
2666
2667     $self->throw("Could not find a sequence segment in $name ".
2668                  "containing residue number $resnumber");
2669
2670 }
2671
2672 =head1 Sequence names
2673
2674 Methods to manipulate the display name. The default name based on the
2675 sequence id and subsequence positions can be overridden in various
2676 ways.
2677
2678 =head2 displayname
2679
2680  Title     : displayname
2681  Usage     : $myalign->displayname("Ig", "IgA")
2682  Function  : Gets/sets the display name of a sequence in the alignment
2683  Returns   : A display name string
2684  Argument  : name of the sequence
2685              displayname of the sequence (optional)
2686
2687 =cut
2688
2689 sub get_displayname {
2690     my $self = shift;
2691     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2692     $self->displayname(@_);
2693 }
2694
2695 sub set_displayname {
2696     my $self = shift;
2697     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2698     $self->displayname(@_);
2699 }
2700
2701 sub displayname {
2702     my ($self, $name, $disname) = @_;
2703
2704     $self->throw("No sequence with name [$name]")
2705         unless defined $self->{'_seq'}->{$name};
2706
2707     if(  $disname and  $name) {
2708         $self->{'_dis_name'}->{$name} = $disname;
2709         return $disname;
2710     }
2711     elsif( defined $self->{'_dis_name'}->{$name} ) {
2712         return  $self->{'_dis_name'}->{$name};
2713     } else {
2714         return $name;
2715     }
2716 }
2717
2718 =head2 set_displayname_count
2719
2720  Title     : set_displayname_count
2721  Usage     : $ali->set_displayname_count
2722  Function  : Sets the names to be name_# where # is the number of
2723              times this name has been used.
2724  Returns   : 1, on success
2725  Argument  :
2726
2727 =cut
2728
2729 sub set_displayname_count {
2730     my $self= shift;
2731     my (@arr,$name,$seq,$count,$temp,$nse);
2732
2733     foreach $seq ( $self->each_alphabetically() ) {
2734         $nse = $seq->get_nse();
2735
2736         #name will be set when this is the second
2737         #time (or greater) is has been seen
2738
2739         if( defined $name and $name eq ($seq->id()) ) {
2740             $temp = sprintf("%s_%s",$name,$count);
2741             $self->displayname($nse,$temp);
2742             $count++;
2743         } else {
2744             $count = 1;
2745             $name = $seq->id();
2746             $temp = sprintf("%s_%s",$name,$count);
2747             $self->displayname($nse,$temp);
2748             $count++;
2749         }
2750     }
2751     return 1;
2752 }
2753
2754 =head2 set_displayname_flat
2755
2756  Title     : set_displayname_flat
2757  Usage     : $ali->set_displayname_flat()
2758  Function  : Makes all the sequences be displayed as just their name,
2759              not name/start-end
2760  Returns   : 1
2761  Argument  :
2762
2763 =cut
2764
2765 sub set_displayname_flat {
2766     my $self = shift;
2767     my ($nse,$seq);
2768
2769     foreach $seq ( $self->each_seq() ) {
2770         $nse = $seq->get_nse();
2771         $self->displayname($nse,$seq->id());
2772     }
2773     return 1;
2774 }
2775
2776 =head2 set_displayname_normal
2777
2778  Title     : set_displayname_normal
2779  Usage     : $ali->set_displayname_normal()
2780  Function  : Makes all the sequences be displayed as name/start-end
2781  Returns   : 1, on success
2782  Argument  :
2783
2784 =cut
2785
2786 sub set_displayname_normal {
2787     my $self = shift;
2788     my ($nse,$seq);
2789
2790     foreach $seq ( $self->each_seq() ) {
2791         $nse = $seq->get_nse();
2792         $self->displayname($nse,$nse);
2793     }
2794     return 1;
2795 }
2796
2797 =head2 source
2798
2799  Title   : source
2800  Usage   : $obj->source($newval)
2801  Function: sets the Alignment source program
2802  Example :
2803  Returns : value of source
2804  Args    : newvalue (optional)
2805
2806
2807 =cut
2808
2809 sub source{
2810    my ($self,$value) = @_;
2811    if( defined $value) {
2812       $self->{'_source'} = $value;
2813     }
2814     return $self->{'_source'};
2815 }
2816
2817 =head2 set_displayname_safe
2818
2819  Title     : set_displayname_safe
2820  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2821  Function  : Assign machine-generated serial names to sequences in input order.
2822              Designed to protect names during PHYLIP runs. Assign 10-char string
2823              in the form of "S000000001" to "S999999999". Restore the original
2824              names using "restore_displayname".
2825  Returns   : 1. a new $aln with system names;
2826              2. a hash ref for restoring names
2827  Argument  : Number for id length (default 10)
2828
2829 =cut
2830
2831 sub set_displayname_safe {
2832     my $self = shift;
2833     my $idlength = shift || 10;
2834     my ($seq, %phylip_name);
2835     my $ct=0;
2836     my $new=Bio::SimpleAlign->new();
2837     foreach $seq ( $self->each_seq() ) {
2838         $ct++;
2839         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2840         $phylip_name{$pname}=$seq->id();
2841         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2842                                             -seq      => $seq->seq(),
2843                                             -alphabet => $seq->alphabet,
2844                                             -start    => $seq->{_start},
2845                                             -end      => $seq->{_end}
2846                                             );
2847         $new->add_seq($new_seq);
2848     }
2849
2850     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2851     return ($new, \%phylip_name);
2852 }
2853
2854 =head2 restore_displayname
2855
2856  Title     : restore_displayname
2857  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2858  Function  : Restore original sequence names (after running
2859              $ali->set_displayname_safe)
2860  Returns   : a new $aln with names restored.
2861  Argument  : a hash reference of names from "set_displayname_safe".
2862
2863 =cut
2864
2865 sub restore_displayname {
2866     my $self = shift;
2867     my $ref=shift;
2868     my %name=%$ref;
2869     my $new=Bio::SimpleAlign->new();
2870     foreach my $seq ( $self->each_seq() ) {
2871       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2872       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2873                                           -seq      => $seq->seq(),
2874                                           -alphabet => $seq->alphabet,
2875                                           -start    => $seq->{_start},
2876                                           -end      => $seq->{_end}
2877                                           );
2878       $new->add_seq($new_seq);
2879     }
2880     return $new;
2881 }
2882
2883 =head2 sort_by_start
2884
2885  Title     : sort_by_start
2886  Usage     : $ali->sort_by_start
2887  Function  : Changes the order of the alignment to the start position of each
2888              subalignment
2889  Returns   :
2890  Argument  :
2891
2892 =cut
2893
2894 sub sort_by_start {
2895     my $self = shift;
2896     my ($seq,$nse,@arr,%hash,$count);
2897     foreach $seq ( $self->each_seq() ) {
2898         $nse = $seq->get_nse;
2899         $hash{$nse} = $seq;
2900     }
2901     $count = 0;
2902     %{$self->{'_order'}} = (); # reset the hash;
2903     foreach $nse ( sort _startend keys %hash) {
2904         $self->{'_order'}->{$count} = $nse;
2905         $count++;
2906     }
2907     1;
2908 }
2909
2910 sub _startend
2911 {
2912     my ($aname,$arange) = split (/[\/]/,$a);
2913     my ($bname,$brange) = split (/[\/]/,$b);
2914     my ($astart,$aend) = split(/\-/,$arange);
2915     my ($bstart,$bend) = split(/\-/,$brange);
2916     return $astart <=> $bstart;
2917 }
2918
2919 =head2 bracket_string
2920
2921  Title     : bracket_string
2922  Usage     : my @params = (-refseq     => 'testseq',
2923                            -allele1    => 'allele1',
2924                            -allele2    => 'allele2',
2925                            -delimiters => '{}',
2926                            -separator  => '/');
2927              $str = $aln->bracket_string(@params)
2928
2929  Function :  When supplied with a list of parameters (see below), returns a
2930              string in BIC format. This is used for allelic comparisons.
2931              Briefly, if either allele contains a base change when compared to
2932              the refseq, the base or gap for each allele is represented in
2933              brackets in the order present in the 'alleles' parameter.
2934
2935              For the following data:
2936
2937              >testseq
2938              GGATCCATTGCTACT
2939              >allele1
2940              GGATCCATTCCTACT
2941              >allele2
2942              GGAT--ATTCCTCCT
2943
2944              the returned string with parameters 'refseq => testseq' and
2945              'alleles => [qw(allele1 allele2)]' would be:
2946
2947              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2948  Returns   : BIC-formatted string
2949  Argument  : Required args
2950                 refseq    : string (ID) of the reference sequence used
2951                             as basis for comparison
2952                 allele1   : string (ID) of the first allele
2953                 allele2   : string (ID) of the second allele
2954              Optional args
2955                 delimiters: two symbol string of left and right delimiters.
2956                             Only the first two symbols are used
2957                             default = '[]'
2958                 separator : string used as a separator.  Only the first
2959                             symbol is used
2960                             default = '/'
2961  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2962
2963 =cut
2964
2965 sub bracket_string {
2966     my ($self, @args) = @_;
2967     my ($ref, $a1, $a2, $delim, $sep) =
2968         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2969     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2970     my ($ld, $rd);
2971     ($ld, $rd) = split('', $delim, 2) if $delim;
2972     $ld ||= '[';
2973     $rd ||= ']';
2974     $sep ||= '/';
2975     my ($refseq, $allele1, $allele2) =
2976         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2977     if (!$refseq || !$allele1 || !$allele2) {
2978         $self->throw("One of your refseq/allele IDs is invalid!");
2979     }
2980     my $len = $self->length-1;
2981     my $bic = '';
2982     # loop over the alignment columns
2983     for my $column ( 0 .. $len ) {
2984         my $string;
2985         my ($compres, $res1, $res2) =
2986             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2987         # are any of the allele symbols different from the refseq?
2988         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2989                 $ld.$res1.$sep.$res2.$rd;
2990         $bic .= $string;
2991     }
2992     return $bic;
2993 }
2994
2995
2996 =head2 methods implementing Bio::FeatureHolderI
2997
2998 FeatureHolderI implementation to support labeled character sets like one
2999 would get from NEXUS represented data.
3000
3001 =head2 get_SeqFeatures
3002
3003  Usage   : @features = $aln->get_SeqFeatures
3004  Function: Get the feature objects held by this feature holder.
3005  Example :
3006  Returns : an array of Bio::SeqFeatureI implementing objects
3007  Args    : optional filter coderef, taking a Bio::SeqFeatureI
3008          : as argument, returning TRUE if wanted, FALSE if
3009          : unwanted
3010
3011 =cut
3012
3013 sub get_SeqFeatures {
3014     my $self = shift;
3015     my $filter_cb = shift;
3016     $self->throw("Arg (filter callback) must be a coderef") unless
3017         !defined($filter_cb) or ref($filter_cb) eq 'CODE';
3018     if( !defined $self->{'_as_feat'} ) {
3019         $self->{'_as_feat'} = [];
3020     }
3021     if ($filter_cb) {
3022         return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
3023     }
3024     return @{$self->{'_as_feat'}};
3025 }
3026
3027 =head2 add_SeqFeature
3028
3029  Usage   : $aln->add_SeqFeature($subfeat);
3030  Function: adds a SeqFeature into the SeqFeature array.
3031  Example :
3032  Returns : true on success
3033  Args    : a Bio::SeqFeatureI object
3034  Note    : This implementation is not compliant
3035            with Bio::FeatureHolderI
3036
3037 =cut
3038
3039 sub add_SeqFeature {
3040    my ($self,@feat) = @_;
3041
3042    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3043
3044    foreach my $feat ( @feat ) {
3045        if( !$feat->isa("Bio::SeqFeatureI") ) {
3046            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3047        }
3048
3049        push(@{$self->{'_as_feat'}},$feat);
3050    }
3051    return 1;
3052 }
3053
3054
3055 =head2 remove_SeqFeatures
3056
3057  Usage   : $obj->remove_SeqFeatures
3058  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3059            remove that subset from the returned array, and add back the rest.
3060  Returns : The array of Bio::SeqFeatureI features that was
3061            deleted from this alignment.
3062  Args    : none
3063
3064 =cut
3065
3066 sub remove_SeqFeatures {
3067     my $self = shift;
3068
3069     return () unless $self->{'_as_feat'};
3070     my @feats = @{$self->{'_as_feat'}};
3071     $self->{'_as_feat'} = [];
3072     return @feats;
3073 }
3074
3075 =head2 feature_count
3076
3077  Title   : feature_count
3078  Usage   : $obj->feature_count()
3079  Function: Return the number of SeqFeatures attached to the alignment
3080  Returns : integer representing the number of SeqFeatures
3081  Args    : None
3082
3083 =cut
3084
3085 sub feature_count {
3086     my ($self) = @_;
3087
3088     if (defined($self->{'_as_feat'})) {
3089         return ($#{$self->{'_as_feat'}} + 1);
3090     } else {
3091         return 0;
3092     }
3093 }
3094
3095 =head2 get_all_SeqFeatures
3096
3097  Title   : get_all_SeqFeatures
3098  Usage   :
3099  Function: Get all SeqFeatures.
3100  Example :
3101  Returns : an array of Bio::SeqFeatureI implementing objects
3102  Args    : none
3103  Note    : Falls through to Bio::FeatureHolderI implementation.
3104
3105 =cut
3106
3107 =head2 methods for Bio::AnnotatableI
3108
3109 AnnotatableI implementation to support sequence alignments which
3110 contain annotation (NEXUS, Stockholm).
3111
3112 =head2 annotation
3113
3114  Title   : annotation
3115  Usage   : $ann = $aln->annotation or
3116            $aln->annotation($ann)
3117  Function: Gets or sets the annotation
3118  Returns : Bio::AnnotationCollectionI object
3119  Args    : None or Bio::AnnotationCollectionI object
3120
3121 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3122 for more information
3123
3124 =cut
3125
3126 sub annotation {
3127     my ($obj,$value) = @_;
3128     if( defined $value ) {
3129         $obj->throw("object of class ".ref($value)." does not implement ".
3130                 "Bio::AnnotationCollectionI. Too bad.")
3131             unless $value->isa("Bio::AnnotationCollectionI");
3132         $obj->{'_annotation'} = $value;
3133     } elsif( ! defined $obj->{'_annotation'}) {
3134         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3135     }
3136     return $obj->{'_annotation'};
3137 }
3138
3139 =head1 Deprecated methods
3140
3141 =cut
3142
3143 =head2 no_residues
3144
3145  Title     : no_residues
3146  Usage     : $no = $ali->no_residues
3147  Function  : number of residues in total in the alignment
3148  Returns   : integer
3149  Argument  :
3150  Note      : deprecated in favor of num_residues()
3151
3152 =cut
3153
3154 sub no_residues {
3155         my $self = shift;
3156         $self->deprecated(-warn_version => 1.0069,
3157                                           -throw_version => 1.0075,
3158                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3159     $self->num_residues(@_);
3160 }
3161
3162 =head2 no_sequences
3163
3164  Title     : no_sequences
3165  Usage     : $depth = $ali->no_sequences
3166  Function  : number of sequence in the sequence alignment
3167  Returns   : integer
3168  Argument  :
3169  Note      : deprecated in favor of num_sequences()
3170
3171 =cut
3172
3173 sub no_sequences {
3174         my $self = shift;
3175         $self->deprecated(-warn_version => 1.0069,
3176                                           -throw_version => 1.0075,
3177                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3178     $self->num_sequences(@_);
3179 }
3180
3181 =head2 mask_columns
3182
3183  Title     : mask_columns
3184  Usage     : $aln2 = $aln->mask_columns(20,30)
3185  Function  : Masks a slice of the alignment inclusive of start and
3186              end columns, and the first column in the alignment is denoted 1.
3187              Mask beyond the length of the sequence does not do padding.
3188  Returns   : A Bio::SimpleAlign object
3189  Args      : Positive integer for start column, positive integer for end column,
3190              optional string value use for the mask. Example:
3191
3192              $aln2 = $aln->mask_columns(20,30,'?')
3193  Note      : Masking must use a character that is not used for gaps or
3194              frameshifts.  These can be adjusted using the relevant global
3195              variables, but be aware these may be (uncontrollably) modified
3196              elsewhere within BioPerl (see bug 2715)
3197
3198 =cut
3199
3200 sub mask_columns {
3201     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3202     my $self = shift;
3203
3204     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3205              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3206
3207     # coordinates are alignment-based, not sequence-based
3208     my ($start, $end, $mask_char) = @_;
3209     unless (defined $mask_char) { $mask_char = 'N' }
3210
3211     $self->throw("Mask start has to be a positive integer and less than ".
3212                  "alignment length, not [$start]")
3213       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3214     $self->throw("Mask end has to be a positive integer and less than ".
3215                  "alignment length, not [$end]")
3216       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3217     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3218                  "end [$end]") unless $start <= $end;
3219     $self->throw("Mask character $mask_char has to be a single character ".
3220                  "and not a gap or frameshift symbol")
3221       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3222
3223     my $aln = $self->new;
3224     $aln->id($self->id);
3225     foreach my $seq ( $self->each_seq() ) {
3226         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3227          -alphabet => $seq->alphabet,
3228          -strand  => $seq->strand,
3229          -verbose => $self->verbose);
3230
3231         # convert from 1-based alignment coords!
3232         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3233         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3234         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3235         $new_seq->seq($new_dna_string);
3236         $aln->add_seq($new_seq);
3237     }
3238     return $aln;
3239 }
3240
3241
3242
3243 1;