Bio/SimpleAlign.pm

   1 # BioPerl module for SimpleAlign
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12 #
  13 #  History:
  14 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  15 #       May 2001 major rewrite - Heikki Lehvaslaiho
  16
  17 =head1 NAME
  18
  19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  20
  21 =head1 SYNOPSIS
  22
  23   # Use Bio::AlignIO to read in the alignment
  24   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  25   $aln = $str->next_aln();
  26
  27   # Describe
  28   print $aln->length;
  29   print $aln->num_residues;
  30   print $aln->is_flush;
  31   print $aln->num_sequences;
  32   print $aln->score;
  33   print $aln->percentage_identity;
  34   print $aln->consensus_string(50);
  35
  36   # Find the position in the alignment for a sequence location
  37   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  38
  39   # Extract sequences and check values for the alignment column $pos
  40   foreach $seq ($aln->each_seq) {
  41       $res = $seq->subseq($pos, $pos);
  42       $count{$res}++;
  43   }
  44   foreach $res (keys %count) {
  45       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  46   }
  47
  48   # Manipulate
  49   $aln->remove_seq($seq);
  50   $mini_aln = $aln->slice(20,30);  # get a block of columns
  51   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  52   $new_aln = $aln->remove_columns([20,30]); # remove by position
  53   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  54
  55   # Analyze
  56   $str = $aln->consensus_string($threshold_percent);
  57   $str = $aln->match_line();
  58   $str = $aln->cigar_line();
  59   $id = $aln->percentage_identity;
  60
  61   # See the module documentation for details and more methods.
  62
  63 =head1 DESCRIPTION
  64
  65 SimpleAlign is an object that handles a multiple sequence alignment
  66 (MSA). It is very permissive of types (it does not insist on sequences
  67 being all same length, for example). Think of it as a set of sequences
  68 with a whole series of built-in manipulations and methods for reading and
  69 writing alignments.
  70
  71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  72 to store its sequences. These are subsequences with a start and end
  73 positions in the parent reference sequence. Each sequence in the
  74 SimpleAlign object is a Bio::LocatableSeq.
  75
  76 SimpleAlign expects the combination of name, start, and end for a
  77 given sequence to be unique in the alignment, and this is the key for the
  78 internal hashes (name, start, end are abbreviated C<nse> in the code).
  79 However, in some cases people do not want the name/start-end to be displayed:
  80 either multiple names in an alignment or names specific to the alignment
  81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  82 C<displayname>, and are generally what is used to print out the
  83 alignment. They default to name/start-end.
  84
  85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  86
  87 =head1 FEEDBACK
  88
  89 =head2 Mailing Lists
  90
  91 User feedback is an integral part of the evolution of this and other
  92 Bioperl modules.  Send your comments and suggestions preferably to one
  93 of the Bioperl mailing lists.  Your participation is much appreciated.
  94
  95   bioperl-l@bioperl.org                  - General discussion
  96   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  97
  98 =head2 Support
  99
 100 Please direct usage questions or support issues to the mailing list:
 101
 102 I<bioperl-l@bioperl.org>
 103
 104 rather than to the module maintainer directly. Many experienced and
 105 responsive experts will be able to look at the problem and quickly
 106 address it. Please include a thorough description of the problem
 107 with code and data examples if at all possible.
 108
 109 =head2 Reporting Bugs
 110
 111 Report bugs to the Bioperl bug tracking system to help us keep track
 112 of the bugs and their resolution. Bug reports can be submitted via the
 113 web:
 114
 115   https://redmine.open-bio.org/projects/bioperl/
 116
 117 =head1 AUTHOR
 118
 119 Ewan Birney, birney@ebi.ac.uk
 120
 121 =head1 CONTRIBUTORS
 122
 123 Allen Day, allenday-at-ucla.edu,
 124 Richard Adams, Richard.Adams-at-ed.ac.uk,
 125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 127 Allen Smith, allens-at-cpan.org,
 128 Jason Stajich, jason-at-bioperl.org,
 129 Anthony Underwood, aunderwood-at-phls.org.uk,
 130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 131 Brian Osborne, bosborne at alum.mit.edu
 132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 133 Hongyu Zhang, forward at hongyu.org
 134 Jay Hannah, jay at jays.net
 135 Alexandr Bezginov, albezg at gmail.com
 136
 137 =head1 SEE ALSO
 138
 139 L<Bio::LocatableSeq>
 140
 141 =head1 APPENDIX
 142
 143 The rest of the documentation details each of the object
 144 methods. Internal methods are usually preceded with a _
 145
 146 =cut
 147
 148 # 'Let the code begin...
 149
 150 package Bio::SimpleAlign;
 151 use vars qw(%CONSERVATION_GROUPS);
 152 use strict;
 153
 154 use Bio::LocatableSeq;  # uses Seq's as list
 155
 156 use Bio::Seq;
 157 use Bio::SeqFeature::Generic;
 158
 BEGIN {
     # Amino-acid conservation groups copied from the ClustalW documentation
     # (this table would arguably be better placed in a shared module).
     # They are all the positively scoring groups of the Gonnet Pam250
     # matrix: 'strong' groups score > 0.5, 'weak' groups score <= 0.5.
     my @strong_groups = qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW);
     my @weak_groups   = qw(
         CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
     );

     %CONSERVATION_GROUPS = (
         'strong' => \@strong_groups,
         'weak'   => \@weak_groups,
     );
 }
 190
 191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 192             Bio::FeatureHolderI);
 193
 194 =head2 new
 195
 196  Title     : new
 197  Usage     : my $aln = Bio::SimpleAlign->new();
 198  Function  : Creates a new simple align object
 199  Returns   : Bio::SimpleAlign
 200  Args      : -source     => string representing the source program
 201                             where this alignment came from
 202              -annotation => Bio::AnnotationCollectionI
 203              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 204              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 205              -consensus  => consensus string
 206              -consensus_meta  => Bio::Seq::Meta object containing consensus meta information (kludge)
 207
 208 =cut
 209
 210
 # Constructor.  Accepts the named arguments listed in @param_names below
 # (given as -source, -score, ...), initialises the internal bookkeeping
 # hashes and then applies each supplied argument through its accessor.
 # Throws when -seq_annotation is given without -annotation.
 sub new {
     my ($class, @args) = @_;

     my $self = $class->SUPER::new(@args);

     my @param_names = qw(
         SOURCE SCORE ID ACCESSION DESCRIPTION SEQS FEATURES
         ANNOTATION SEQ_ANNOTATION CONSENSUS CONSENSUS_META
     );
     my %arg;
     @arg{@param_names} = $self->_rearrange(\@param_names, @args);

     if ( $arg{SOURCE} ) {
         $self->source( $arg{SOURCE} );
     }
     if ( defined $arg{SCORE} ) {
         $self->score( $arg{SCORE} );
     }

     # The internal hashes have to exist before anything else is stored.
     $self->{'_seq'}             = {};
     $self->{'_order'}           = {};
     $self->{'_start_end_lists'} = {};
     $self->{'_dis_name'}        = {};
     $self->{'_id'}              = 'NoName';

     $self->id( $arg{ID} )                   if $arg{ID};
     $self->accession( $arg{ACCESSION} )     if $arg{ACCESSION};
     $self->description( $arg{DESCRIPTION} ) if $arg{DESCRIPTION};

     my $collection = $arg{ANNOTATION};
     $self->annotation($collection) if $collection;

     # Per-sequence annotation is stored inside the alignment-level
     # collection, so that collection is mandatory here.
     if ( $arg{SEQ_ANNOTATION} ) {
         if ( !$collection ) {
             $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
                          "with a sequence annotation collection");
         }
         $collection->add_Annotation('seq_annotation', $arg{SEQ_ANNOTATION});
     }

     my $features = $arg{FEATURES};
     if ( $features && ref $features eq 'ARRAY' ) {
         foreach my $feature ( @{$features} ) {
             $self->add_SeqFeature($feature);
         }
     }

     $self->consensus( $arg{CONSENSUS} )           if $arg{CONSENSUS};
     $self->consensus_meta( $arg{CONSENSUS_META} ) if $arg{CONSENSUS_META};

     # Sequences are assumed to be supplied in alignment order.
     my $sequences = $arg{SEQS};
     if ( $sequences && ref($sequences) eq 'ARRAY' ) {
         foreach my $sequence ( @{$sequences} ) {
             $self->add_seq($sequence);
         }
     }

     return $self;
 }
 266
 267 =head1 Modifier methods
 268
 269 These methods modify the MSA by adding, removing or shuffling complete
 270 sequences.
 271
 272 =head2 add_seq
 273
 274  Title     : add_seq
 275  Usage     : $myalign->add_seq($newseq);
 276              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 277  Function  : Adds another sequence to the alignment. *Does not* align
 278              it - just adds it to the hashes.
 279              If -ORDER is specified, the sequence is inserted at the
 280              position spec'd by -ORDER, and existing sequences
 281              are pushed down the storage array.
 282  Returns   : nothing
 283  Args      : A Bio::LocatableSeq object
 284              Positive integer for the sequence position (optional)
 285
 286 See L<Bio::LocatableSeq> for more information
 287
 288 =cut
 289
 # Deprecated camel-case alias: reports the deprecation and then hands
 # every argument on to add_seq(), returning whatever add_seq() returns.
 sub addSeq {
     my ($self, @forwarded) = @_;
     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
     return $self->add_seq(@forwarded);
 }
 295
 # Add a sequence to the alignment (it is stored, not aligned).
 #
 # Args   : the sequence, either positionally or as -SEQ => $seq; it must
 #          be a Bio::LocatableSeq.  Optional -ORDER => N gives the
 #          1-based position to insert at; sequences already stored at
 #          that position and directly after it are moved down by one.
 # Returns: the stored sequence object (value of the final assignment).
 # Throws : if no sequence is given, if it is not a Bio::LocatableSeq, or
 #          if -ORDER is smaller than 1.
 #
 # A sequence whose name/start-end key is already present replaces the
 # stored one (with a warning unless verbose < 0) and keeps its position.
 sub add_seq {
     my ($self, @args) = @_;
     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);

     unless ($seq) {
         $self->throw("LocatableSeq argument required");
     }
     if ( !ref $seq || !$seq->isa('Bio::LocatableSeq') ) {
         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
     }

     # Default: append after the last stored sequence.  The user-visible
     # order is 1-based, the '_order' hash is keyed from 0.
     $order = 1 + keys %{ $self->{'_seq'} } unless defined $order;
     $order--;
     if ( $order < 0 ) {
         $self->throw("User-specified value for ORDER must be >= 1");
     }

     my $id   = $seq->id() || $seq->display_id || $seq->primary_id;
     my $name = $seq->get_nse;
     my $list = $self->{'_start_end_lists'}->{$id} ||= [];

     if ( $self->{'_seq'}->{$name} ) {
         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;

         # Swap the superseded object for the new one in the per-id list,
         # otherwise each_seq_with_id() keeps returning the old object and
         # remove_seq() cannot find the sequence that is actually stored.
         my $old     = $self->{'_seq'}->{$name};
         my $swapped = 0;
         foreach my $entry ( @{$list} ) {
             next unless $entry eq $old;
             $entry   = $seq;    # $entry aliases the list element
             $swapped = 1;
             last;
         }
         push @{$list}, $seq unless $swapped;
     }
     else {
         $self->debug("Assigning $name to $order\n");

         my $ordh = $self->{'_order'};
         if ( $ordh->{$order} ) {
             # Make room: move the unbroken run of positions that starts at
             # $order up by one, working from the top so nothing is
             # overwritten.  Positions after a gap stay where they are.
             my $last = $order;
             $last++ while exists $ordh->{ $last + 1 };
             for ( my $pos = $last; $pos >= $order; $pos-- ) {
                 $ordh->{ $pos + 1 } = $ordh->{$pos};
             }
         }
         $ordh->{$order} = $name;

         push @{$list}, $seq;
     }

     return $self->{'_seq'}->{$name} = $seq;
 }
 355
 356
 357 =head2 remove_seq
 358
 359  Title     : remove_seq
 360  Usage     : $aln->remove_seq($seq);
 361  Function  : Removes a single sequence from an alignment
 362  Returns   :
 363  Argument  : a Bio::LocatableSeq object
 364
 365 =cut
 366
 # Deprecated camel-case alias: reports the deprecation and then hands
 # every argument on to remove_seq(), returning its result.
 sub removeSeq {
     my ($self, @forwarded) = @_;
     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
     return $self->remove_seq(@forwarded);
 }
 372
 # Remove a single sequence from the alignment.
 #
 # Args   : a Bio::LocatableSeq that is part of the alignment
 # Returns: 1 on success
 # Throws : if the argument is not a Bio::LocatableSeq, if its
 #          name/start-end key is not stored, or if it cannot be found in
 #          the per-id list.  All checks run before anything is changed,
 #          so a failed call leaves the alignment untouched.
 sub remove_seq {
     my ($self, $seq) = @_;

     $self->throw("Need Bio::Locatable seq argument ")
         unless ref $seq && $seq->isa('Bio::LocatableSeq');

     # Same key derivation as add_seq(), so the per-id list is always found.
     my $id   = $seq->id() || $seq->display_id || $seq->primary_id;
     my $name = $seq->get_nse;

     if ( !exists $self->{'_seq'}->{$name} ) {
         $self->throw("Sequence $name does not exist in the alignment to remove!");
     }

     my $list = $self->{'_start_end_lists'}->{$id};
     if ( !$list ) {
         $self->throw("There is no seq list for the name $id");
     }

     # Locate the entry: the very same object first, otherwise an entry
     # with the same name/start-end key.
     my ($index) = grep { $list->[$_] eq $seq } 0 .. $#{$list};
     if ( !defined $index ) {
         ($index) = grep { $list->[$_]->get_nse eq $name } 0 .. $#{$list};
     }
     if ( !defined $index ) {
         $self->throw("Could not find the sequence to remove from the start-end list");
     }

     delete $self->{'_seq'}->{$name};
     splice @{$list}, $index, 1;

     # Close the gap in the order hash by moving later entries down by one.
     my %position_of = reverse %{ $self->{'_order'} };
     my $no          = $position_of{$name};
     if ( defined $no ) {
         my $num_sequences = $self->num_sequences;
         for ( ; $no < $num_sequences; $no++ ) {
             $self->{'_order'}->{$no} = $self->{'_order'}->{ $no + 1 };
         }
         delete $self->{'_order'}->{$no};
     }
     return 1;
 }
 422
 423
 424 =head2 purge
 425
 426  Title   : purge
 427  Usage   : $aln->purge(0.7);
 428  Function: Removes sequences above given sequence similarity
 429            This function will grind on large alignments. Beware!
 430  Example :
 431  Returns : An array of the removed sequences
 432  Args    : float, threshold for similarity
 433
 434 =cut
 435
 # Remove sequences that are too similar to an earlier sequence.
 #
 # Args   : $perc - similarity threshold (fraction, e.g. 0.7)
 # Returns: the list of removed sequence objects
 #
 # Every sequence is compared with each later sequence that has not been
 # flagged yet.  Similarity is the number of identical columns divided by
 # the number of columns in which neither sequence has a gap ('.' or '-');
 # a later sequence scoring above $perc is flagged and finally removed
 # with remove_seq().  The work grows quadratically with the number of
 # sequences.
 sub purge {
     my ($self, $perc) = @_;

     my @seqs = $self->each_seq();

     # Split every sequence once, and identify sequences by their
     # name/start-end key: display ids may be shared by several sequences.
     my @residues = map { [ split //, $_->seq() ] } @seqs;
     my @keys     = map { scalar $_->get_nse } @seqs;

     my (%duplicate, @dups);
     for my $i ( 0 .. $#seqs - 1 ) {
         next if $duplicate{ $keys[$i] };
         my $one = $residues[$i];

         for my $j ( $i + 1 .. $#seqs ) {
             next if $duplicate{ $keys[$j] };
             my $two = $residues[$j];

             my ($count, $res) = (0, 0);
             for my $k ( 0 .. $#{$one} ) {
                 my ($x, $y) = ( $one->[$k], $two->[$k] );
                 next if $x eq '.' || $x eq '-' || !defined $y;
                 $count++ if $x eq $y;
                 $res++   if $y ne '.' && $y ne '-';
             }

             my $ratio = $res ? $count / $res : 0;

             if ( $ratio > $perc ) {
                 my $seq2 = $seqs[$j];
                 # warn() is given one string, so build the whole message.
                 $self->warn("duplicate: " . $seq2->display_id) if $self->verbose > 0;
                 $duplicate{ $keys[$j] } = 1;
                 push @dups, $seq2;
             }
         }
     }

     foreach my $seq (@dups) {
         $self->remove_seq($seq);
     }
     return @dups;
 }
 490
 491 =head2 sort_alphabetically
 492
 493  Title     : sort_alphabetically
 494  Usage     : $ali->sort_alphabetically
 495  Function  : Changes the order of the alignment to alphabetical on name
 496              followed by numerical by number.
 497  Returns   :
 498  Argument  :
 499
 500 =cut
 501
 # Re-number the alignment in place so that the sequences follow the
 # ordering defined by _alpha_startend() on their name/start-end keys.
 # Returns 1.
 sub sort_alphabetically {
     my ($self) = @_;

     my %by_nse;
     foreach my $member ( $self->each_seq() ) {
         $by_nse{ $member->get_nse } = $member;
     }

     # Empty the existing hash rather than replacing it.
     %{ $self->{'_order'} } = ();

     my @sorted = sort _alpha_startend keys %by_nse;
     @{ $self->{'_order'} }{ 0 .. $#sorted } = @sorted;

     return 1;
 }
 522
 523 =head2 sort_by_list
 524
 525  Title     : sort_by_list
 526  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 527  Function  : Arbitrarily order sequences in an alignment
 528  Returns   : A new Bio::SimpleAlign object
 529  Argument  : a file listing sequence names in intended order (one name per line)
 530
 531 =cut
 532
 # Build a new alignment whose sequences follow the order given in a file.
 #
 # Args   : $list - path of a text file with one sequence name per line;
 #          surrounding whitespace (including a trailing CR) is ignored
 #          and blank lines are skipped
 # Returns: a new alignment object created with $self->new
 # Throws : if the file cannot be opened or names a sequence that is not
 #          in the alignment
 #
 # Sequences that the file does not mention get rank 0 and therefore come
 # first, in their current relative order.
 sub sort_by_list {
     my ($self, $list) = @_;

     my @seqs = $self->each_seq();
     my %in_alignment;
     foreach my $member (@seqs) {
         $in_alignment{ $member->display_id } = 1;
     }

     my %rank;
     my $next_rank = 1;
     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
     # A lexical line variable keeps the caller's $_ intact.
     while ( my $name = <$listfh> ) {
         $name =~ s/^\s+//;
         $name =~ s/\s+$//;
         next if $name eq '';
         $self->throw("Not found in alignment: $name") unless $in_alignment{$name};
         $rank{$name} = $next_rank++;
     }
     close($listfh);

     # map-sort-map: decorate with the rank, sort, strip the decoration.
     my @sorted =
         map  { $_->[1] }
         sort { $a->[0] <=> $b->[0] }
         map  { my $r = $rank{ $_->id() }; [ ( defined $r ? $r : 0 ), $_ ] } @seqs;

     my $aln = $self->new;
     foreach my $member (@sorted) {
         $aln->add_seq($member);
     }
     return $aln;
 }
 558
 559 =head2 set_new_reference
 560
 561  Title     : set_new_reference
 562  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 563              the sequence whose name is "B31" (full, exact, and case-sensitive),
 564              as the reference (1st) sequence
 565  Function  : Change/Set a new reference (i.e., the first) sequence
 566  Returns   : a new Bio::SimpleAlign object.
 567              Throws an exception if designated sequence not found
 568  Argument  : a positive integer of sequence order, or a sequence name
 569              in the original alignment
 570
 571 =cut
 572
 # Build a new alignment in which the chosen sequence comes first.
 #
 # Args   : $seqid - either a sequence position (all digits; must be > 1
 #          and <= num_sequences) or the exact display id of a sequence
 # Returns: a new alignment object created with $self->new
 # Throws : if the position is out of range or the name is not found
 #
 # An all-digit argument is always a position: sequences are then matched
 # by position only, so a sequence whose display id happens to equal that
 # number is not moved as well.
 sub set_new_reference {
     my ($self, $seqid) = @_;
     my $aln = $self->new;

     my @seqs = $self->each_seq();
     my @ids;
     foreach my $member (@seqs) {
         push @ids, $member->display_id;
     }

     my $by_position = ( $seqid =~ /^\d+$/ ) ? 1 : 0;
     if ($by_position) {
         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
     }
     else {
         $self->throw("The new reference sequence not in alignment ") unless _in_aln($seqid, \@ids);
     }

     my @new_seq;
     for my $i ( 0 .. $#seqs ) {
         my $is_reference =
             $by_position
             ? ( $i + 1 == $seqid )
             : ( $seqid eq $seqs[$i]->display_id );
         if ($is_reference) {
             unshift @new_seq, $seqs[$i];
         }
         else {
             push @new_seq, $seqs[$i];
         }
     }

     foreach my $member (@new_seq) {
         $aln->add_seq($member);
     }
     return $aln;
 }
 601
 # Membership test: returns 1 when $wanted is string-equal to one of the
 # names in the array referenced by $names, otherwise 0.
 sub _in_aln {
     my ($wanted, $names) = @_;
     my $hits = grep { $wanted eq $_ } @{$names};
     return $hits ? 1 : 0;
 }
 609
 610
 611 =head2 uniq_seq
 612
 613  Title     : uniq_seq
 614  Usage     : $aln->uniq_seq():  Remove identical sequences in
 615              the alignment.  Ambiguous base ("N", "n") and
 616              leading and ending gaps ("-") are NOT counted as
 617              differences.
 618  Function  : Make a new alignment of unique sequence types (STs)
 619  Returns   : 1a. if called in a scalar context,
 620                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 621              1b. if called in an array context,
 622                 a new Bio::SimpleAlign object, and a hashref whose keys
 623                 are sequence types, and whose values are arrayrefs to
 624                 lists of sequence ids within the corresponding sequence type
 625              2. if $aln->verbose > 0, ST of each sequence is sent to
 626                 STDERR (in a tabular format)
 627  Argument  : None
 628
 629 =cut
 630
 # Collapse the alignment into its distinct sequence types (STs).
 #
 # Returns: in scalar context a new alignment holding one sequence per
 #          type, named "ST1", "ST2", ... in order of first appearance;
 #          in list context that alignment plus a hashref mapping the type
 #          number to an arrayref of the ids of its member sequences.
 #
 # "n"/"N" (DNA only) and leading/trailing gaps are masked with "?" before
 # the comparison, so they never count as differences.  The second
 # argument is accepted but not used.
 sub uniq_seq {
     my ($self, $seqid) = @_;
     my $uniq_aln   = $self->new;
     my $aln_length = $self->length();

     my (%members_of, %rank_of, @masked_seqs, @uniq_strings);
     my $st_members = {};
     my $n_types    = 0;

     # Pass 1: make a masked copy of every sequence.
     foreach my $original ( $self->each_seq() ) {
         my $masked = $original->seq();

         # DNA only: ambiguous bases become "?"
         $masked =~ s/n/\?/gi if $masked =~ /^[atcgn-]+$/i;
         # leading and trailing gaps become "?" as well
         $masked = _convert_leading_ending_gaps($masked, '-', '?');

         # "?" can also stand for an unknown residue; it is declared a gap
         # symbol only while the masked copy is being constructed.
         local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
         my $copy = Bio::LocatableSeq->new(
             -id       => $original->id(),
             -alphabet => $original->alphabet,
             -seq      => $masked,
             -start    => $original->start,
             -end      => $original->end
         );
         push @masked_seqs, $copy;
     }

     # Pass 2: assign every masked sequence to a sequence type.
     foreach my $candidate (@masked_seqs) {
         my $string = $candidate->seq();
         my ($seen, $key) = _check_uniq($string, \@uniq_strings, $aln_length);
         if ($seen) {
             push @{ $members_of{$key} }, $candidate;
         }
         else {
             push @uniq_strings, $key;
             $n_types++;
             $members_of{$key} = [ $candidate ];
             $rank_of{$key}    = $n_types;
         }
     }

     # Pass 3: emit one sequence per type, in order of first appearance.
     foreach my $key ( sort { $rank_of{$a} <=> $rank_of{$b} } keys %rank_of ) {
         # leading/trailing "?" go back to "-", inner "?" back to "N"
         # (SimpleAlign does not accept "?")
         my $restored = _convert_leading_ending_gaps($key, '?', '-');
         $restored =~ s/\?/N/g if $restored =~ /^[atcg\-\?]+$/i;

         # the end coordinate is the number of non-gap characters
         my $gap_count = ( $restored =~ tr/-// );
         my $end       = CORE::length($restored) - $gap_count;

         my $type_no = $rank_of{$key};
         my $st_seq  = Bio::LocatableSeq->new(
             -id    => "ST" . $type_no,
             -seq   => $restored,
             -start => 1,
             -end   => $end
         );
         $uniq_aln->add_seq($st_seq);

         foreach my $member ( @{ $members_of{$key} } ) {
             push @{ $st_members->{$type_no} }, $member->id();
             $self->debug( $member->id(), "\t", "ST", $type_no, "\n" );
         }
     }
     return wantarray ? ( $uniq_aln, $st_members ) : $uniq_aln;
 }
 698
 # Look for an already-seen string that equals $str1 when "?" is treated
 # as a wildcard on either side.
 #
 # Args   : $str1   - string to classify
 #          $ref    - arrayref of the strings seen so far
 #          $length - number of leading columns to compare
 # Returns: (1, $matching_string) for the first stored string that
 #          matches, otherwise (0, $str1)
 #
 # Columns beyond the end of a short string are compared as empty strings.
 sub _check_uniq {
     my ($str1, $ref, $length) = @_;

     return (0, $str1) unless @{$ref};    # nothing seen yet

     my @char1 = split //, $str1;

     CANDIDATE:
     foreach my $str2 ( @{$ref} ) {
         my @char2 = split //, $str2;
         for my $i ( 0 .. $length - 1 ) {
             my $c1 = defined $char1[$i] ? $char1[$i] : '';
             my $c2 = defined $char2[$i] ? $char2[$i] : '';
             next if $c1 eq '?' || $c2 eq '?';
             # one real difference rules this candidate out
             next CANDIDATE if $c1 ne $c2;
         }
         return (1, $str2);               # seen before
     }

     return (0, $str1);                   # not seen
 }
 719
 # Return a copy of $s in which the run of $sym1 characters at the start
 # and the run at the end are replaced, character by character, with
 # $sym2.  Occurrences of $sym1 elsewhere in the string are left alone.
 sub _convert_leading_ending_gaps {
     my ($s, $sym1, $sym2) = @_;
     my @chars = split //, $s;

     # leading run
     my $head = 0;
     while ( $head <= $#chars && $chars[$head] eq $sym1 ) {
         $chars[$head] = $sym2;
         $head++;
     }

     # trailing run
     my $tail = $#chars;
     while ( $tail >= 0 && $chars[$tail] eq $sym1 ) {
         $chars[$tail] = $sym2;
         $tail--;
     }

     return join '', @chars;
 }
 736
 737 =head1 Sequence selection methods
 738
 739 Methods returning one or more sequences objects.
 740
 741 =head2 each_seq
 742
 743  Title     : each_seq
 744  Usage     : foreach $seq ( $align->each_seq() )
 745  Function  : Gets a Seq object from the alignment
 746  Returns   : Seq object
 747  Argument  :
 748
 749 =cut
 750
 # Deprecated camel-case alias: reports the deprecation and returns the
 # result of each_seq(), which is called without arguments.
 sub eachSeq {
     my ($self) = @_;
     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
     return $self->each_seq();
 }
 756
 # Return the stored sequence objects in alignment order, i.e. by
 # ascending numeric key of the '_order' hash.  Order entries whose name
 # has no stored sequence are skipped.
 sub each_seq {
     my ($self) = @_;

     my @positions = sort { $a <=> $b } keys %{ $self->{'_order'} };
     my @names     = @{ $self->{'_order'} }{@positions};

     my @found =
         map  { $self->{'_seq'}->{$_} }
         grep { exists $self->{'_seq'}->{$_} } @names;

     return @found;
 }
 768
 769
 770 =head2 each_alphabetically
 771
 772  Title     : each_alphabetically
 773  Usage     : foreach $seq ( $ali->each_alphabetically() )
 774  Function  : Returns a sequence object, but the objects are returned
 775              in alphabetically sorted order.
 776              Does not change the order of the alignment.
 777  Returns   : Seq object
 778  Argument  :
 779
 780 =cut
 781
 # Return the sequence objects ordered by _alpha_startend() applied to
 # their name/start-end keys.  The alignment's own order is not changed.
 sub each_alphabetically {
     my ($self) = @_;

     my %by_nse;
     foreach my $member ( $self->each_seq() ) {
         $by_nse{ $member->get_nse } = $member;
     }

     my @sorted = map { $by_nse{$_} } sort _alpha_startend keys %by_nse;
     return @sorted;
 }
 796
 # sort() comparator for name/start-end keys (reads $a and $b).
 #
 # Keys are ordered by name (string comparison), then numerically by
 # start, then numerically by end.  The coordinates are taken from the end
 # of the key, so names containing "-" or "/" are handled.  A key that is
 # not of the form "name/start-end" is compared as a bare name with
 # coordinates 0.
 sub _alpha_startend {
     my @left  = $a =~ m{^(.*)/(-?\d+)-(-?\d+)$};
     my @right = $b =~ m{^(.*)/(-?\d+)-(-?\d+)$};
     @left  = ( $a, 0, 0 ) unless @left;
     @right = ( $b, 0, 0 ) unless @right;

     my $cmp = ( $left[0] cmp $right[0] )
            || ( $left[1] <=> $right[1] )
            || ( $left[2] <=> $right[2] );
     return $cmp;
 }
 809
 810 =head2 each_seq_with_id
 811
 812  Title     : each_seq_with_id
 813  Usage     : foreach $seq ( $align->each_seq_with_id() )
 814  Function  : Gets a Seq objects from the alignment, the contents
 815              being those sequences with the given name (there may be
 816              more than one)
 817  Returns   : Seq object
 818  Argument  : a seq name
 819
 820 =cut
 821
 # Deprecated camel-case alias: reports the deprecation and then hands
 # every argument on to each_seq_with_id(), returning its result.
 sub eachSeqWithId {
     my ($self, @forwarded) = @_;
     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
     return $self->each_seq_with_id(@forwarded);
 }
 827
 # Return every sequence object stored under the given name (several
 # sequences may share one name); an empty list if the name is unknown.
 # Throws when no name is supplied.
 sub each_seq_with_id {
     my ($self, $id) = @_;

     if ( !defined $id ) {
         $self->throw("Method each_seq_with_id needs a sequence name argument");
     }

     my @matches;
     @matches = @{ $self->{'_start_end_lists'}->{$id} }
         if exists $self->{'_start_end_lists'}->{$id};

     return @matches;
 }
 842
 843 =head2 get_seq_by_pos
 844
 845  Title     : get_seq_by_pos
 846  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 847  Function  : Gets a sequence based on its position in the alignment.
 848              Numbering starts from 1.  Sequence positions larger than
 849              num_sequences() will throw an error.
 850  Returns   : a Bio::LocatableSeq object
 851  Args      : positive integer for the sequence position
 852
 853 =cut
 854
 # Return the sequence stored at a 1-based position of the alignment.
 # Throws when the position is not a positive integer or is larger than
 # num_sequences().
 sub get_seq_by_pos {
     my ($self, $pos) = @_;

     my $is_positive_int = ( $pos =~ /^\d+$/ and $pos > 0 );
     if ( !$is_positive_int ) {
         $self->throw("Sequence position has to be a positive integer, not [$pos]");
     }
     if ( $pos > $self->num_sequences ) {
         $self->throw("No sequence at position [$pos]");
     }

     # the '_order' hash is keyed from 0
     my $index = $pos - 1;
     my $nse   = $self->{'_order'}->{$index};
     return $self->{'_seq'}->{$nse};
 }
 868
 869 =head2 get_seq_by_id
 870
 871  Title     : get_seq_by_id
 872  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 873  Function  : Gets a sequence based on its name.
 874              Returns undef if no sequence has that name (no warning is issued)
 875  Returns   : a Bio::LocatableSeq object
 876  Args      : string for sequence name
 877
 878 =cut
 879
 # Return a stored sequence whose id() equals $name.  With no name, a
 # warning is issued and nothing is returned; if no sequence matches,
 # nothing is returned either (undef in scalar context).
 sub get_seq_by_id {
     my ($self, $name) = @_;

     if ( !defined $name ) {
         $self->warn("Must provide a sequence name");
         return;
     }

     my ($match) = grep { $_->id eq $name } values %{ $self->{'_seq'} };
     return $match if defined $match;
     return;
 }
 893
 894 =head2 seq_with_features
 895
 896  Title   : seq_with_features
 897  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 898                                           -consensus => 60,
 899                                           -mask =>
 900            sub { my $consensus = shift;
 901
 902                  for my $i (1..5){
 903                     my $n = 'N' x $i;
 904                     my $q = '\?' x $i;
 905                     while($consensus =~ /[^?]$q[^?]/){
 906                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 907                     }
 908                   }
 909                  return $consensus;
 910                }
 911                                          );
 912  Function: produces a Bio::Seq object by first splicing gaps from -pos
 913            (by means of a splice_by_seq_pos() call), then creating
 914            features using non-? chars (by means of a consensus_string()
 915            call with stringency -consensus).
 916  Returns : a Bio::Seq object
 917  Args    : -pos : required. sequence from which to build the Bio::Seq
 918              object
 919            -consensus : optional, defaults to consensus_string()'s
 920              default cutoff value
 921            -mask : optional, a coderef to apply to consensus_string()'s
 922              output before building features.  this may be useful for
 923              closing gaps of 1 bp by masking over them with N, for
 924              instance
 925
 926 =cut
 927
 # Builds a Bio::Seq carrying one Bio::SeqFeature::Generic per run of
 # conserved (non-'?') consensus columns.
 #
 # Named arguments:
 #   -pos       : required; 1-based position of the guide sequence.
 #   -consensus : optional; passed straight to consensus_string().
 #   -mask      : optional coderef; receives the consensus string and
 #                returns the (possibly edited) string to scan.
 #
 # Side effect: calls splice_by_seq_pos(-pos) first, which edits the
 # sequences of this alignment in place.
 #
 # Feature coordinates are consensus columns (1-based) shifted by the
 # guide sequence's start().  Both ends are inclusive: a run covering
 # columns 1-2 followed by '?' ends at column 2.  (Earlier code took the
 # end from pos() after matching /[^?]\?/, which is the column of the '?'
 # itself, making every interior feature one column too long.)
 sub seq_with_features {
     my ($self, %arg) = @_;

     # first do the preparatory splice
     $self->throw("must provide a -pos argument") unless $arg{-pos};
     $self->splice_by_seq_pos($arg{-pos});

     my $consensus_string = $self->consensus_string($arg{-consensus});
     $consensus_string = $arg{-mask}->($consensus_string)
         if defined $arg{-mask};

     # Collect [first, last] 1-based columns of every run of non-'?' chars.
     # @- / @+ hold the 0-based start and the exclusive end of the match.
     my @runs;
     while ( $consensus_string =~ /[^?]+/g ) {
         push @runs, [ $-[0] + 1, $+[0] ];
     }

     my $seq = Bio::Seq->new();
     return $seq unless @runs;

     # Loop-invariant values, looked up once.
     my $offset     = $self->get_seq_by_pos($arg{-pos})->start - 1;
     my $source_tag = $self->source || 'MSA';

     foreach my $run (@runs) {
         my ($first_col, $last_col) = @{$run};
         $seq->add_SeqFeature(
             Bio::SeqFeature::Generic->new(
                 -start      => $first_col + $offset,
                 -end        => $last_col + $offset,
                 -source_tag => $source_tag,
             )
         );
     }

     return $seq;
 }
 974
 975
 976 =head1 Create new alignments
 977
 978 The result of these methods are horizontal or vertical subsets of the
 979 current MSA.
 980
 981 =head2 select
 982
 983  Title     : select
 984  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 985  Function  : Creates a new alignment from a continuous subset of
 986              sequences.  Numbering starts from 1.  Sequence positions
 987              larger than num_sequences() will throw an error.
 988  Returns   : a Bio::SimpleAlign object
 989  Args      : positive integer for the first sequence
 990              positive integer for the last sequence to include (optional)
 991
 992 =cut
 993
 # Creates a new alignment holding the sequences at positions
 # $start .. $end (1-based, inclusive) of this one, in that order.
 #   $start : positive integer, first sequence to copy.
 #   $end   : positive integer >= $start, last sequence to copy.
 # Invalid arguments are reported via $self->throw(); positions beyond
 # num_sequences() are rejected by get_seq_by_pos().
 # The new alignment gets this alignment's id and is handed the same
 # sequence objects that get_seq_by_pos() returns.
 # NOTE(review): the POD lists the end position as optional, but the
 # validation below rejects an undefined $end -- confirm which is intended.
 sub select {
     my ($self, $start, $end) = @_;

     $self->throw("Select start has to be a positive integer, not [$start]")
         unless $start =~ /^\d+$/ and $start > 0;
     $self->throw("Select end has to be a positive integer, not [$end]")
         unless $end =~ /^\d+$/ and $end > 0;
     # The message used to interpolate $start where the word "start" was
     # meant ("Select 5 [5] has to be ...").
     $self->throw("Select start [$start] has to be smaller than or equal to end [$end]")
         unless $start <= $end;

     my $aln = $self->new;
     foreach my $pos ( $start .. $end ) {
         $aln->add_seq( $self->get_seq_by_pos($pos) );
     }
     $aln->id( $self->id );
     # fix for meta, sf, ann
     return $aln;
 }
1013
1014 =head2 select_noncont
1015
1016  Title     : select_noncont
1017  Usage     : # 1st and 3rd sequences, sorted
1018              $aln2 = $aln->select_noncont(1, 3)
1019
1020              # 1st and 3rd sequences, sorted (same as first)
1021              $aln2 = $aln->select_noncont(3, 1)
1022
1023              # 1st and 3rd sequences, unsorted
1024              $aln2 = $aln->select_noncont('nosort',3, 1)
1025
1026  Function  : Creates a new alignment from a subset of sequences.  Numbering
1027              starts from 1.  Sequence positions larger than num_sequences() will
1028              throw an error.  Sequences are added to the new alignment in sorted
1029              order by default; to prevent sorting, pass 'nosort' as the first argument.
1030  Returns   : a Bio::SimpleAlign object
1031  Args      : array of integers for the sequences.  If the string 'nosort' is
1032              passed as the first argument, the sequences will not be sorted
1033              in the new alignment but will appear in the order listed.
1034
1035 =cut
1036
# Creates a new alignment from an arbitrary set of 1-based sequence
# positions.
#   @_ : optional leading directive 'nosort', then the positions.
# Positions are added in ascending numeric order unless 'nosort' was
# given, in which case the caller's order is kept.  Any other leading
# non-numeric argument, and any position outside 1 .. num_sequences(),
# is reported via $self->throw().  The new alignment receives this
# alignment's id.
sub select_noncont {
    my ($self, @pos) = @_;
    my $nosort = 0;

    # A non-numeric first argument is a sort directive, not a position.
    if ( $pos[0] !~ m{^\d+$} ) {
        my $sortcmd = shift @pos;
        if ( $sortcmd ne 'nosort' ) {
            $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
        }
        else {
            $nosort = 1;
        }
    }

    my $end = $self->num_sequences;
    for my $p (@pos) {
        my $in_range = ( $p =~ /^\d+$/ && $p > 0 && $p <= $end );
        $self->throw("position must be a positive integer, > 0 and <= $end not [$p]")
            unless $in_range;
    }

    unless ($nosort) {
        @pos = sort { $a <=> $b } @pos;
    }

    my $aln = $self->new;
    $aln->add_seq( $self->get_seq_by_pos($_) ) for @pos;
    $aln->id( $self->id );
    # fix for meta, sf, ann
    return $aln;
}
1066
1067 =head2 select_noncont_by_name
1068
1069  Title     : select_noncont_by_name
1070  Usage     : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1071  Function  : Creates a new alignment from a subset of sequences which are
1072              selected by name (sequence ID).
1073  Returns   : a Bio::SimpleAlign object
1074  Args      : array of names (i.e., identifiers) for the sequences.
1075
1076 =cut
1077
# Creates a new alignment from the sequences whose ids are listed in
# @names, added in the order given; the new alignment receives this
# alignment's id.
# NOTE(review): a name that get_seq_by_id() cannot find makes add_seq()
# be called with no sequence at all -- how add_seq() reacts is not visible
# here; verify.
sub select_noncont_by_name {
    my $self  = shift;
    my @names = @_;

    my $aln = $self->new;
    for my $name (@names) {
        $aln->add_seq( $self->get_seq_by_id($name) );
    }
    $aln->id( $self->id );

    return $aln;
}
1089
1090 =head2 slice
1091
1092  Title     : slice
1093  Usage     : $aln2 = $aln->slice(20,30)
1094  Function  : Creates a slice from the alignment inclusive of start and
1095              end columns, and the first column in the alignment is denoted 1.
1096              Sequences with no residues in the slice are excluded from the
1097              new alignment and a warning is printed. Slice beyond the length of
1098              the sequence does not do padding.
1099  Returns   : A Bio::SimpleAlign object
1100  Args      : Positive integer for start column, positive integer for end column,
1101              optional boolean which if true will keep gap-only columns in the newly
1102              created slice. Example:
1103
1104              $aln2 = $aln->slice(20,30,1)
1105
1106 =cut
1107
# Creates a new alignment from columns $start .. $end (1-based,
# inclusive) of this one.
#   $start         : positive integer, first column; must not exceed
#                    the alignment length.
#   $end           : positive integer >= $start, last column.  It is
#                    clipped to each sequence's own length.
#   $keep_gap_only : when true, sequences whose slice holds no residues
#                    are kept; otherwise they are dropped with a warning.
# For every sequence a fresh object is built (Bio::Seq::Meta when the
# source is a Bio::Seq::MetaI, else Bio::LocatableSeq) and its start/end
# are recomputed from the number of word characters before and inside
# the slice.  Consensus meta data is sliced as well, and the annotation
# object of this alignment is passed on to the new one.
sub slice {
    my ($self, $start, $end, $keep_gap_only) = @_;

    unless ( $start =~ /^\d+$/ and $start > 0 ) {
        $self->throw("Slice start has to be a positive integer, not [$start]");
    }
    unless ( $end =~ /^\d+$/ and $end > 0 ) {
        $self->throw("Slice end has to be a positive integer, not [$end]");
    }
    unless ( $start <= $end ) {
        $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]");
    }
    if ( $start > $self->length ) {
        $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
                     "[$start] is too big.");
    }

    my $cons_meta = $self->consensus_meta;
    my $aln       = $self->new;
    $aln->id( $self->id );

    foreach my $seq ( $self->each_seq() ) {
        my $seq_class = $seq->isa('Bio::Seq::MetaI')
                      ? 'Bio::Seq::Meta'
                      : 'Bio::LocatableSeq';
        my $new_seq = $seq_class->new(
            -id       => $seq->id,
            -alphabet => $seq->alphabet,
            -strand   => $seq->strand,
            -verbose  => $self->verbose,
        );

        # seq: never read past the end of this particular sequence
        my $seq_end = $end > $seq->length ? $seq->length : $end;

        my $slice_seq = $seq->subseq( $start, $seq_end );
        $new_seq->seq($slice_seq);

        # From here on only the residue count matters: drop every
        # non-word character (gap symbols included).
        $slice_seq =~ s/\W//g;
        my $slice_residues = CORE::length($slice_seq);

        my $is_reverse = ( defined( $seq->strand ) && $seq->strand < 0 );
        if ( $start > 1 ) {
            # residues located before the slice shift the new start
            my $pre_start_seq = $seq->subseq( 1, $start - 1 );
            $pre_start_seq =~ s/\W//g;
            my $skipped = CORE::length($pre_start_seq);
            $new_seq->start(
                $is_reverse
                ? $seq->end - $skipped - $slice_residues + 1
                : $seq->start + $skipped
            );
        }
        else {
            $new_seq->start(
                $is_reverse
                ? $seq->end - $slice_residues + 1
                : $seq->start
            );
        }

        if ( $new_seq->isa('Bio::Seq::MetaI') ) {
            for my $meta_name ( $seq->meta_names ) {
                $new_seq->named_meta( $meta_name,
                    $seq->named_submeta( $meta_name, $start, $end ) );
            }
        }
        $new_seq->end( $new_seq->start + $slice_residues - 1 );

        my $has_residues
            = ( $new_seq->start and $new_seq->end >= $new_seq->start );
        if ( $has_residues or $keep_gap_only ) {
            $aln->add_seq($new_seq);
        }
        else {
            my $nse = $seq->get_nse();
            $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
                        " Sequence excluded from the new alignment.");
        }
    }

    if ($cons_meta) {
        my $new = Bio::Seq::Meta->new();
        for my $meta_name ( $cons_meta->meta_names ) {
            $new->named_meta( $meta_name,
                $cons_meta->named_submeta( $meta_name, $start, $end ) );
        }
        $aln->consensus_meta($new);
    }
    $aln->annotation( $self->annotation );
    # fix for meta, sf, ann
    return $aln;
}
1192
1193 =head2 remove_columns
1194
1195  Title     : remove_columns
1196  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1197              $aln2 = $aln->remove_columns([0,0],[6,8])
1198  Function  : Creates an alignment with columns removed corresponding to
1199              the specified type or by specifying the columns by number.
1200  Returns   : Bio::SimpleAlign object
1201  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1202              'all_gaps_columns') or array ref where the referenced array
1203              contains a pair of integers that specify a range.
1204              The first column is 0
1205
1206 =cut
1207
# Dispatches column removal either by column type or by column range.
#   @args : either one array ref of type names (e.g. ['mismatch','weak']),
#           of which only the first array ref is used, or one or more
#           array refs each holding a [start, end] pair of integers.
# Returns whatever _remove_columns_by_type() / _remove_columns_by_num()
# returns.  Calling it with no arguments, or with a first argument that
# is not an array ref starting with a word or a number, is reported via
# $self->throw().
sub remove_columns {
    my ($self, @args) = @_;
    @args or $self->throw("Must supply column ranges or column types");

    # Check the container before looking inside it: dereferencing a plain
    # string with $args[0][0] dies under 'strict refs' before the
    # explanatory throw below can be reached.
    my $first = ref( $args[0] ) eq 'ARRAY' ? $args[0][0] : undef;

    my $aln;
    if ( defined $first and $first =~ /^[a-z_]+$/i ) {
        $aln = $self->_remove_columns_by_type( $args[0] );
    }
    elsif ( defined $first and $first =~ /^\d+$/ ) {
        $aln = $self->_remove_columns_by_num( \@args );
    }
    else {
        $self->throw("You must pass array references to remove_columns(), not @args");
    }
    # fix for meta, sf, ann
    return $aln;
}
1223
1224
1225 =head2 remove_gaps
1226
1227  Title     : remove_gaps
1228  Usage     : $aln2 = $aln->remove_gaps
1229  Function  : Creates an alignment with gaps removed
1230  Returns   : a Bio::SimpleAlign object
1231  Args      : a gap character(optional) if none specified taken
1232                 from $self->gap_char,
1233              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1234                         indicates that only all-gaps columns should be deleted
1235
1236 Used from method L<remove_columns> in most cases. Set gap character
1237 using L<gap_char()|gap_char>.
1238
1239 =cut
1240
# Removes gap columns from the alignment.
#   $gapchar          : optional gap character(s) handed to gap_line() /
#                       all_gap_line() to decide what counts as a gap.
#   $all_gaps_columns : when true only columns made up entirely of gaps
#                       are removed (all_gap_line()); otherwise every
#                       column containing a gap is removed (gap_line()).
# Returns the alignment built by _remove_col(), or $self itself (not a
# copy) when there is nothing to remove.
#
# Both gap_line() and all_gap_line() mark the columns to drop with
# $self->gap_char and every other column with '.', whatever $gapchar was
# used for detection.  The scan below therefore looks for
# $self->gap_char; scanning for $gapchar (as was done before) found
# nothing whenever the two differed.
# NOTE(review): if gap_char itself is '.', marked and unmarked columns
# cannot be told apart in those lines -- that ambiguity originates in
# gap_line()/all_gap_line().
sub remove_gaps {
    my ($self, $gapchar, $all_gaps_columns) = @_;

    my $gap_line = $all_gaps_columns
                 ? $self->all_gap_line($gapchar)
                 : $self->gap_line($gapchar);
    return $self unless defined $gap_line;    # nothing to scan

    my $marker = quotemeta $self->gap_char;

    # Each run of marked columns becomes one [start, end] pair.  Pairs
    # are expressed relative to the sequence as it will look after the
    # preceding pairs have been cut out, hence the running offset.
    my @remove;
    my $removed = 0;
    while ( $gap_line =~ /(?:$marker)+/g ) {
        my $start = $-[0] - $removed;
        my $end   = $+[0] - 1 - $removed;
        push @remove, [ $start, $end ];
        $removed += $end - $start + 1;
    }

    # fix for meta, sf, ann
    return @remove ? $self->_remove_col( $self->new, \@remove ) : $self;
}
1272
1273
# Cuts column ranges out of every sequence and adds the shortened copies
# to $aln.
#   $aln    : alignment object that receives the new sequences (returned).
#   $remove : array ref of [start, end] pairs, 0-based and inclusive.
#             Each pair is applied to the sequence as left by the previous
#             pair, so callers pre-shift later pairs by the number of
#             columns already removed.
# For each sequence a new Bio::LocatableSeq is created; its start/end are
# adjusted when a removed range touches the beginning or the end of the
# sequence, discounting gap characters inside the removed stretch.  A
# sequence left without residues gets start = end = 0.
#
# The gap character is escaped before being used in a pattern: unescaped,
# a gap character of '.' matched every residue and '*' or '?' produced an
# invalid regex.
sub _remove_col {
    my ($self, $aln, $remove) = @_;
    my @new;

    my $gap = quotemeta $self->gap_char;

    # splice out the segments and create new seq
    foreach my $seq ( $self->each_seq ) {
        my $new_seq = Bio::LocatableSeq->new(
            -id       => $seq->id,
            -alphabet => $seq->alphabet,
            -strand   => $seq->strand,
            -verbose  => $self->verbose,
        );
        my $sequence = $seq->seq;
        foreach my $pair ( @{$remove} ) {
            my ($start, $end) = @{$pair}[ 0, 1 ];
            # NOTE(review): this reloads the full sequence whenever the
            # working copy is empty (or "0") -- kept as is; confirm intent.
            $sequence = $seq->seq unless $sequence;
            my $orig = $sequence;
            my $head = $start > 0 ? substr( $sequence, 0, $start ) : '';
            my $tail = ( $end + 1 ) >= CORE::length($sequence)
                     ? ''
                     : substr( $sequence, $end + 1 );
            $sequence = $head . $tail;

            # start: only the first pair that gets here sets it
            unless ( defined $new_seq->start ) {
                if ( $start == 0 ) {
                    # removed prefix: advance by its residues (gaps excluded)
                    my $start_adjust = () = substr( $orig, 0, $end + 1 ) =~ /$gap/g;
                    $new_seq->start( $seq->start + $end + 1 - $start_adjust );
                }
                else {
                    # true (1) only when leading gaps end exactly at $start
                    my $start_adjust = $orig =~ /^(?:$gap)+/;
                    if ($start_adjust) {
                        $start_adjust = $+[0] == $start;
                    }
                    $new_seq->start( $seq->start + $start_adjust );
                }
            }

            # end
            if ( ( $end + 1 ) >= CORE::length($orig) ) {
                # removed suffix: pull the end back by its residues
                my $end_adjust = () = substr( $orig, $start ) =~ /$gap/g;
                $new_seq->end( $seq->end - ( CORE::length($orig) - $start ) + $end_adjust );
            }
            else {
                $new_seq->end( $seq->end );
            }
        }

        if ( $new_seq->end < $new_seq->start ) {
            # we removed all columns except for gaps: set to 0 to indicate no
            # sequence
            $new_seq->start(0);
            $new_seq->end(0);
        }

        $new_seq->seq($sequence) if $sequence;
        push @new, $new_seq;
    }

    # add the new seqs to the alignment
    foreach my $new (@new) {
        $aln->add_seq($new);
    }
    # fix for meta, sf, ann
    return $aln;
}
1337
# Removes the columns whose match_line() symbol belongs to one of the
# requested types.
#   $type : array ref of type names: 'match', 'weak', 'strong',
#           'mismatch', 'gaps', 'all_gaps_columns'.
# 'gaps' and 'all_gaps_columns' have no match-line symbol; they are
# handled afterwards by remove_gaps().  Names without an entry in the
# symbol table contribute nothing.
# Returns the resulting alignment, or $self itself when no column
# qualified and no gap removal was requested.
sub _remove_columns_by_type {
    my ($self, $type) = @_;

    # Plain conditional assignments: "my $x = ... if COND" has undefined
    # behaviour in Perl (the variable may keep a value from an earlier
    # call when COND is false).
    my $gap = ( grep { $_ eq 'gaps' } @{$type} )
            ? $self->gap_char : undef;
    my $all_gaps_columns = ( grep { /all_gaps_columns/ } @{$type} )
                         ? $self->gap_char : undef;

    # Symbols as they appear in match_line(), pre-escaped for use inside
    # a character class.
    my %matchchars = (
        'match'            => '\*',
        'weak'             => '\.',
        'strong'           => ':',
        'mismatch'         => ' ',
        'gaps'             => '',
        'all_gaps_columns' => '',
    );

    # get the characters to delete against
    my $del_char = join '',
        map { defined $matchchars{$_} ? $matchchars{$_} : '' } @{$type};

    # do the matching to get the segments to remove
    my @remove;
    if ( length $del_char ) {
        my $match_line = $self->match_line;
        my $removed    = 0;
        while ( defined $match_line and $match_line =~ m/[$del_char]+/g ) {
            # have to offset the start and end for subsequent removes
            my $start = $-[0] - $removed;
            my $end   = $+[0] - 1 - $removed;
            push @remove, [ $start, $end ];
            $removed += $end - $start + 1;
        }
    }

    # remove the segments
    my $aln = @remove ? $self->_remove_col( $self->new, \@remove ) : $self;
    $aln = $aln->remove_gaps()        if $gap;
    $aln = $aln->remove_gaps( '', 1 ) if $all_gaps_columns;
    # fix for meta, sf, ann
    return $aln;
}
1382
1383
# Removes explicitly numbered column ranges.
#   $positions : array ref of [start, end] pairs (0-based, inclusive).
#                The referenced array is sorted in place by start column.
# Each pair is shifted left by the number of columns removed by the
# pairs before it, which is the form _remove_col() expects.
# Returns the alignment built by _remove_col(), or $self itself when
# $positions is empty.
sub _remove_columns_by_num {
    my ($self, $positions) = @_;
    my $aln = $self->new;

    # sort the positions
    @$positions = sort { $a->[0] <=> $b->[0] } @$positions;

    my $removed = 0;    # columns cut so far
    my @remove  = map {
        my ($start, $end) = @{$_};
        my $shifted = [ $start - $removed, $end - $removed ];
        $removed += $end - $start + 1;
        $shifted;
    } @{$positions};

    # remove the segments
    # fix for meta, sf, ann
    return @remove ? $self->_remove_col( $aln, \@remove ) : $self;
}
1408
1409
1410 =head1 Change sequences within the MSA
1411
1412 These methods affect characters in all sequences without changing the
1413 alignment.
1414
1415 =head2 splice_by_seq_pos
1416
1417  Title   : splice_by_seq_pos
1418  Usage   : $status = $aln->splice_by_seq_pos(1);
1419  Function: splices all aligned sequences where the specified sequence
1420            has gaps.
1421  Example :
1422  Returns : 1 on success
1423  Args    : position of sequence to splice by
1424
1425
1426 =cut
1427
# Deletes, from every sequence of the alignment, the columns in which the
# guide sequence (the one at 1-based position $pos) has a gap.  Both '-'
# and '.' in the guide count as gaps.  The sequences are modified in
# place through their seq() accessor; only the sequence strings are
# touched here.  Always returns 1.
sub splice_by_seq_pos {
    my ($self, $pos) = @_;

    my $guide_seq = $self->get_seq_by_pos($pos)->seq;
    $guide_seq =~ tr/./-/;

    # Collect the gap columns last-to-first so that cutting one column
    # does not shift the columns still to be cut.
    my @gap_columns;
    while ( $guide_seq =~ /-/g ) {
        unshift @gap_columns, pos($guide_seq) - 1;
    }

    foreach my $seq ( $self->each_seq ) {
        my @bases = split '', $seq->seq;
        foreach my $column (@gap_columns) {
            splice @bases, $column, 1;
        }
        $seq->seq( join '', @bases );
    }

    return 1;
}
1452
1453 =head2 map_chars
1454
1455  Title     : map_chars
1456  Usage     : $ali->map_chars('\.','-')
1457  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1458              characters.
1459
1460              Note that the first argument is interpreted as a regexp
1461              so be careful and escape any wild card characters (e.g.
1462              do $ali->map_chars('\.','-') to replace periods with dashes).
1463  Returns   :
1464  Argument  : A regexp and a string
1465
1466 =cut
1467
# Applies s/$from/$to/g to the sequence string of every sequence in the
# alignment, writing the result back through seq().
#   $from : pattern, interpolated as a regular expression (escape
#           metacharacters yourself).
#   $to   : replacement string.
# Both arguments are required; a missing one is reported via
# $self->throw().  Returns 1.
sub map_chars {
    my ($self, $from, $to) = @_;

    unless ( defined $from and defined $to ) {
        $self->throw("Need two arguments: a regexp and a string");
    }

    for my $seq ( $self->each_seq() ) {
        ( my $mapped = $seq->seq() ) =~ s/$from/$to/g;
        $seq->seq($mapped);
    }
    return 1;
}
1484
1485
1486 =head2 uppercase
1487
1488  Title     : uppercase()
1489  Usage     : $ali->uppercase()
1490  Function  : Sets all the sequences to uppercase
1491  Returns   :
1492  Argument  :
1493
1494 =cut
1495
# Converts the letters a-z of every sequence string to upper case,
# writing the result back through seq().  All other characters are left
# untouched.  Returns 1.
sub uppercase {
    my ($self) = @_;

    for my $seq ( $self->each_seq() ) {
        my $upper = $seq->seq();
        # tr/// takes plain character ranges; no brackets needed
        $upper =~ tr/a-z/A-Z/;
        $seq->seq($upper);
    }
    return 1;
}
1509
1510 =head2 cigar_line
1511
1512  Title    : cigar_line()
1513  Usage    : %cigars = $align->cigar_line()
1514  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1515             Report) line for each sequence in the alignment. Examples are
1516             "1,60" or "5,10:12,58", where the numbers refer to conserved
1517             positions within the alignment. The keys of the hash are the
1518             NSEs (name/start/end) assigned to each sequence.
1519  Args     : threshold (optional, defaults to 100)
1520  Returns  : Hash of strings (cigar lines)
1521
1522 =cut
1523
# Builds, for every sequence, a string of "start,end" ranges (joined by
# ':') listing the residue positions at which the upper-cased sequence
# agrees with the consensus string.
#   threshold : optional, handed to consensus_string(); 100 when omitted
#               or false.
# Positions are counted per sequence from 1 and advance on every
# character that is not the alignment's gap character.
# Returns a hash (flattened list) keyed by each sequence's get_nse();
# sequences without any agreeing position get no entry.
sub cigar_line {
    my ($self, $thr) = @_;
    $thr ||= 100;
    my %cigars;

    my @consensus = split "", ( $self->consensus_string($thr) );
    my $len       = $self->length;
    my $gapchar   = $self->gap_char;

    # create a precursor, something like (1,4,5,6,7,33,45),
    # where each number corresponds to a conserved position
    foreach my $seq ( $self->each_seq ) {
        my @residues = split "", uc( $seq->seq );
        my $pos = 1;
        for my $col ( 0 .. $len - 1 ) {
            my $conserved = $residues[$col] eq $consensus[$col];
            push @{ $cigars{ $seq->get_nse } }, $pos if $conserved;
            $pos++ if $conserved or $residues[$col] ne $gapchar;
        }
    }

    for my $name ( keys %cigars ) {
        my $list = $cigars{$name};    # same array, shorter to write

        # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
        splice @{$list}, 1, 0, $list->[0]
            if $list->[0] + 1 < $list->[1];
        push @{$list}, $list->[ $#{$list} ]
            if $list->[ $#{$list} - 1 ] + 1 < $list->[ $#{$list} ];
        # the upper bound is re-read on purpose: the list grows as we go
        for ( my $x = 1 ; $x < $#{$list} - 1 ; $x++ ) {
            my $isolated = $list->[ $x - 1 ] + 1 < $list->[$x]
                        && $list->[ $x + 1 ] > $list->[$x] + 1;
            splice @{$list}, $x, 0, $list->[$x] if $isolated;
        }

        # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
        my @remove;
        for my $x ( 0 .. $#{$list} - 1 ) {
            my $inside_run = $list->[$x] == $list->[ $x - 1 ] + 1
                          && $list->[$x] == $list->[ $x + 1 ] - 1;
            unshift @remove, $x if $inside_run;    # highest index first
        }
        splice @{$list}, $_, 1 for @remove;

        # join and punctuate
        my @spans;
        while ( my ( $from, $to ) = splice @{$list}, 0, 2 ) {
            push @spans, $from . "," . $to;
        }
        $cigars{$name} = join ":", @spans;
    }
    return %cigars;
}
1585
1586
1587 =head2 match_line
1588
1589  Title    : match_line()
1590  Usage    : $line = $align->match_line()
1591  Function : Generates a match line - much like a consensus string,
1592             except that each column shows a match symbol ('*' for a match).
1593  Args     : (optional) Match line characters ('*' by default)
1594             (optional) Strong match char (':' by default)
1595             (optional) Weak match char ('.' by default)
1596  Returns  : String
1597
1598 =cut
1599
# Produces one symbol per alignment column describing how well the
# upper-cased sequences agree in that column.
#   $matchlinechar : symbol for identical columns   (default '*')
#   $strong        : symbol for strong conservation (default ':')
#   $weak          : symbol for weak conservation   (default '.')
# A column gets the mismatch symbol ' ' when any sequence shows '-', '.'
# or ' ' there, when the first sequence is too short to reach it, or when
# nothing better applies.  Strong/weak symbols are only considered when
# the first defined alphabet() among the sequences is 'protein': a column
# qualifies for a type when one group of %CONSERVATION_GROUPS for that
# type contains every residue seen in the column.
# Returns the match line as a string (undef for an alignment of length 0).
#
# Unlike the earlier code this does not use the global $_ as a loop
# counter, so a caller's $_ (e.g. inside map/for) is no longer clobbered,
# and the 'goto' has been replaced by ordinary control flow.
sub match_line {
    my ($self, $matchlinechar, $strong, $weak) = @_;
    my %matchchars = (
        'match'    => $matchlinechar || '*',
        'weak'     => $weak          || '.',
        'strong'   => $strong        || ':',
        'mismatch' => ' ',
    );

    my @seqchars;
    my $alphabet;
    foreach my $seq ( $self->each_seq ) {
        push @seqchars, [ split( //, uc( $seq->seq ) ) ];
        $alphabet = $seq->alphabet unless defined $alphabet;
    }
    my $refseq = shift @seqchars;

    # let's just march down the columns
    my $matchline;
    foreach my $pos ( 0 .. $self->length ) {
        my $refchar = $refseq->[$pos];
        my $char    = $matchchars{'mismatch'};

        if ( !defined $refchar ) {
            # One step past the last column: done.  Otherwise the
            # reference is shorter than the alignment and the column
            # stays a mismatch.
            last if $pos == $self->length;
        }
        else {
            my %col  = ( $refchar => 1 );
            my $dash = ( $refchar eq '-' || $refchar eq '.' || $refchar eq ' ' );
            foreach my $seq (@seqchars) {
                next if $pos >= scalar @$seq;
                $dash = 1
                    if ( $seq->[$pos] eq '-'
                      || $seq->[$pos] eq '.'
                      || $seq->[$pos] eq ' ' );
                $col{ $seq->[$pos] }++ if defined $seq->[$pos];
            }
            my @colresidues = sort keys %col;

            if ($dash) {
                $char = $matchchars{'mismatch'};
            }
            elsif ( @colresidues == 1 ) {
                # all the values are the same
                $char = $matchchars{'match'};
            }
            elsif ( defined $alphabet and $alphabet eq 'protein' ) {
                # only try to do weak/strong matches for protein seqs
              TYPE:
                foreach my $type (qw(strong weak)) {
                    # for every group of this type, the column residues
                    # that are members of it
                    my %groups;
                    foreach my $c (@colresidues) {
                        foreach my $f ( grep { index( $_, $c ) >= 0 }
                                        @{ $CONSERVATION_GROUPS{$type} } )
                        {
                            push @{ $groups{$f} }, $c;
                        }
                    }
                  GRP:
                    foreach my $cols ( values %groups ) {
                        @$cols = sort @$cols;
                        # the group explains the column only if its
                        # members are exactly the column residues
                        next if scalar @$cols != scalar @colresidues;
                        for my $i ( 0 .. $#{$cols} ) {
                            next GRP if $cols->[$i] ne $colresidues[$i];
                        }
                        $char = $matchchars{$type};
                        last TYPE;
                    }
                }
            }
        }
        $matchline .= $char;
    }
    return $matchline;
}
1674
1675
1676 =head2 gap_line
1677
1678  Title    : gap_line()
1679  Usage    : $line = $align->gap_line()
1680  Function : Generates a gap line - much like a consensus string,
1681             except that '-' marks each column containing a gap
1682  Args     : (optional) gap line characters ('-' by default)
1683  Returns  : string
1684
1685 =cut
1686
# Builds a string with one character per alignment column:
# $self->gap_char for every column in which at least one sequence has a
# gap, '.' for every other column.
#   $gapchar : optional character(s) treated as gaps when scanning the
#              sequences (used inside a character class); defaults to
#              $self->gap_char.
# Returns the string (undef for an alignment of length 0).
sub gap_line {
    my ($self, $gapchar) = @_;
    $gapchar ||= $self->gap_char;

    # columns (0-based) in which some sequence shows a gap
    my %has_gap;
    foreach my $seq ( $self->each_seq ) {
        my @chars = split( //, uc( $seq->seq ) );
        foreach my $col ( 0 .. $#chars ) {
            $has_gap{$col} = undef if $chars[$col] =~ m/[$gapchar]/;
        }
    }

    my $gap_line;
    foreach my $pos ( 0 .. $self->length - 1 ) {
        if ( exists $has_gap{$pos} ) {
            $gap_line .= $self->gap_char;
        }
        else {
            $gap_line .= '.';
        }
    }
    return $gap_line;
}
1702
1703 =head2 all_gap_line
1704
1705  Title    : all_gap_line()
1706  Usage    : $line = $align->all_gap_line()
1707  Function : Generates a gap line - much like a consensus string,
1708             except that '-' marks each all-gap column
1709  Args     : (optional) gap line characters ('-' by default)
1710  Returns  : string
1711
1712 =cut
1713
# Build a one-character-per-column line flagging columns made up only of gaps.
#
# Args    : $gapchar (optional) - the character(s) to treat as gaps;
#           defaults to the alignment's gap_char.
# Returns : string in which the alignment's gap_char marks every column where
#           all sequences have a gap character and '.' marks all other
#           columns (undefined when the alignment has no columns).
sub all_gap_line {
    my ($self, $gapchar) = @_;
    $gapchar = $gapchar || $self->gap_char;

    # Quote the gap characters so they are always a literal character set;
    # an empty set becomes a pattern that never matches.
    my $gap_re = ( defined $gapchar && CORE::length($gapchar) )
               ? qr/[\Q$gapchar\E]/
               : qr/(?!)/;

    my %gap_count;    # column index => number of sequences with a gap there
    my @seqs = $self->each_seq;
    foreach my $seq (@seqs) {
        my $str = uc $seq->seq;
        while ( $str =~ /$gap_re/g ) {
            $gap_count{ pos($str) - 1 }++;
        }
    }

    my $nof_seqs = scalar @seqs;
    my $gap_line;
    foreach my $pos ( 0 .. $self->length - 1 ) {
        my $all_gaps = exists $gap_count{$pos} && $gap_count{$pos} == $nof_seqs;
        $gap_line .= $all_gaps ? $self->gap_char : '.';
    }
    return $gap_line;
}
1735
1736 =head2 gap_col_matrix
1737
1738  Title    : gap_col_matrix()
1739  Usage    : my $cols = $align->gap_col_matrix()
1740  Function : Generates an array where each element in the array is a
1741             hash reference with a key of the sequence name and a
1742             value of 1 if the sequence has a gap at that column
1743  Returns  : Reference to an array
1744  Args     : Optional: gap line character ($aln->gap_char or '-' by default)
1745
1746 =cut
1747
# Build a per-column table of which sequences have a gap.
#
# Args    : $gapchar (optional) - the character(s) to treat as gaps;
#           defaults to the alignment's gap_char.
# Returns : reference to an array with one hash reference per column; each
#           hash maps a sequence display_id to 1 when that sequence has a gap
#           character in the column and to '' otherwise.
sub gap_col_matrix {
    my ( $self, $gapchar ) = @_;
    $gapchar = $gapchar || $self->gap_char;

    # Quote the gap characters so they are always a literal character set;
    # an empty set becomes a pattern that never matches.
    my $gap_re = ( defined $gapchar && CORE::length($gapchar) )
               ? qr/[\Q$gapchar\E]/
               : qr/(?!)/;

    my @cols;
    foreach my $seq ( $self->each_seq ) {
        my $str = $seq->seq;
        my $len = $seq->length;
        my $id  = $seq->display_id;
        foreach my $i ( 0 .. $len - 1 ) {
            my $ch = substr( $str, $i, 1 );
            # scalar assignment keeps the match result as 1 or ''
            $cols[$i]->{$id} = ( $ch =~ $gap_re );
        }
    }
    return \@cols;
}
1766
1767 =head2 match
1768
1769  Title     : match()
1770  Usage     : $ali->match()
1771  Function  : Goes through all columns and changes residues that are
1772              identical to residue in first sequence to match '.'
1773              character. Sets match_char.
1774
1775              USE WITH CARE: Most MSA formats do not support match
1776              characters in sequences, so this is mostly for output
1777              only. NEXUS format (Bio::AlignIO::nexus) can handle
1778              it.
1779  Returns   : 1
1780  Argument  : a match character, optional, defaults to '.'
1781
1782 =cut
1783
# Replace residues identical to the first (reference) sequence by a match
# character in every other sequence, and record it as match_char.
#
# Args    : $match (optional) - the match character, defaults to '.'
# Returns : 1
sub match {
    my ($self, $match) = @_;

    $match ||= '.';

    # map_chars() receives the match character in pattern form, so quote
    # every metacharacter (not only a hand-picked subset) before handing it
    # over; already present match characters are turned into '-'.
    $self->map_chars( quotemeta($match), '-' );

    my @seqs = $self->each_seq();
    return 1 unless scalar @seqs > 1;

    my $refseq  = shift @seqs;
    my @refseq  = split //, $refseq->seq;
    my $gapchar = $self->gap_char;

    foreach my $seq (@seqs) {
        my @varseq = split //, $seq->seq();
        for my $i ( 0 .. $#varseq ) {
            my $ref = $refseq[$i];
            next unless defined $ref;

            # Only residues, '*' and the gap character can be matched.  The
            # gap character is compared literally: used as a pattern, '.'
            # would match anything and an empty string would silently
            # re-use the last successful pattern.
            next unless $ref =~ /[A-Za-z\*]/ || $ref eq $gapchar;

            $varseq[$i] = $match if $ref eq $varseq[$i];
        }
        $seq->seq( join '', @varseq );
    }
    $self->match_char($match);
    return 1;
}
1812
1813
1814 =head2 unmatch
1815
1816  Title     : unmatch()
1817  Usage     : $ali->unmatch()
1818  Function  : Undoes the effect of method match. Unsets match_char.
1819  Returns   : 1
1820  Argument  : a match character, optional, defaults to '.'
1821
1822 See L<match> and L<match_char>
1823
1824 =cut
1825
# Undo match(): put the residue of the first (reference) sequence back
# wherever another sequence carries the match character, then clear
# match_char.
#
# Args    : $match (optional) - the match character, defaults to '.'
# Returns : 1
sub unmatch {
    my ($self, $match) = @_;

    $match ||= '.';

    my @seqs = $self->each_seq();
    return 1 unless scalar @seqs > 1;

    my $refseq  = shift @seqs;
    my @refseq  = split //, $refseq->seq;
    my $gapchar = $self->gap_char;

    foreach my $seq (@seqs) {
        my @varseq = split //, $seq->seq();
        for my $i ( 0 .. $#varseq ) {
            my $ref = $refseq[$i];
            next unless defined $ref;

            # Same eligibility rule as match(); the gap character is
            # compared literally rather than used as a pattern ('.' would
            # match anything, '' would re-use the last successful pattern).
            next unless $ref =~ /[A-Za-z\*]/ || $ref eq $gapchar;

            $varseq[$i] = $ref if $varseq[$i] eq $match;
        }
        $seq->seq( join '', @varseq );
    }
    $self->match_char('');
    return 1;
}
1850
1851 =head1 MSA attributes
1852
1853 Methods for setting and reading the MSA attributes.
1854
1855 Note that the methods defining character semantics depend on the user
1856 to set them sensibly.  They are needed only by certain input/output
1857 methods. Unset them by setting to an empty string ('').
1858
1859 =head2 id
1860
1861  Title     : id
1862  Usage     : $myalign->id("Ig")
1863  Function  : Gets/sets the id field of the alignment
1864  Returns   : An id string
1865  Argument  : An id string (optional)
1866
1867 =cut
1868
# Accessor for the alignment id: stores the argument when one is given
# (any defined value) and always returns the currently stored id.
sub id {
    my $self   = shift;
    my $new_id = shift;

    $self->{'_id'} = $new_id if defined $new_id;
    return $self->{'_id'};
}
1878
1879 =head2 accession
1880
1881  Title     : accession
1882  Usage     : $myalign->accession("PF00244")
1883  Function  : Gets/sets the accession field of the alignment
1884  Returns   : An acc string
1885  Argument  : An acc string (optional)
1886
1887 =cut
1888
# Accessor for the alignment accession: stores the argument when one is
# given (any defined value) and always returns the stored accession.
sub accession {
    my $self    = shift;
    my $new_acc = shift;

    $self->{'_accession'} = $new_acc if defined $new_acc;
    return $self->{'_accession'};
}
1898
1899 =head2 description
1900
1901  Title     : description
1902  Usage     : $myalign->description("14-3-3 proteins")
1903  Function  : Gets/sets the description field of the alignment
 Returns   : A description string
 Argument  : A description string (optional)
1906
1907 =cut
1908
# Accessor for the alignment description: stores the argument when one is
# given (any defined value) and always returns the stored description.
sub description {
    my $self     = shift;
    my $new_desc = shift;

    $self->{'_description'} = $new_desc if defined $new_desc;
    return $self->{'_description'};
}
1918
1919 =head2 missing_char
1920
1921  Title     : missing_char
1922  Usage     : $myalign->missing_char("?")
1923  Function  : Gets/sets the missing_char attribute of the alignment
1924              It is generally recommended to set it to 'n' or 'N'
1925              for nucleotides and to 'X' for protein.
 Returns   : A missing_char string
 Argument  : A missing_char string (optional)
1928
1929 =cut
1930
# Accessor for the character that denotes missing data.
# A defined argument must be at most one character long (longer values are
# reported through throw()); it replaces the stored value.  Returns the
# stored character.
sub missing_char {
    my $self = shift;
    my $char = shift;

    return $self->{'_missing_char'} unless defined $char;

    if ( CORE::length($char) > 1 ) {
        $self->throw("Single missing character, not [$char]!");
    }
    $self->{'_missing_char'} = $char;
    return $self->{'_missing_char'};
}
1941
1942 =head2 match_char
1943
1944  Title     : match_char
1945  Usage     : $myalign->match_char('.')
1946  Function  : Gets/sets the match_char attribute of the alignment
 Returns   : A match_char string
 Argument  : A match_char string (optional)
1949
1950 =cut
1951
# Accessor for the character that denotes a match to the reference sequence.
# A defined argument must be at most one character long (longer values are
# reported through throw()); it replaces the stored value.  Returns the
# stored character.
sub match_char {
    my $self = shift;
    my $char = shift;

    return $self->{'_match_char'} unless defined $char;

    if ( CORE::length($char) > 1 ) {
        $self->throw("Single match character, not [$char]!");
    }
    $self->{'_match_char'} = $char;
    return $self->{'_match_char'};
}
1962
1963 =head2 gap_char
1964
1965  Title     : gap_char
1966  Usage     : $myalign->gap_char('-')
1967  Function  : Gets/sets the gap_char attribute of the alignment
 Returns   : A gap_char string, defaults to '-'
 Argument  : A gap_char string (optional)
1970
1971 =cut
1972
# Accessor for the gap character.
# When called without a value while nothing is stored yet, '-' is installed
# as the default.  A value to store must be at most one character long
# (longer values are reported through throw()).  Returns the stored
# character.
sub gap_char {
    my $self = shift;
    my $char = shift;

    # first read of an unset attribute: fall back to the default
    if ( !defined $char && !defined $self->{'_gap_char'} ) {
        $char = '-';
    }

    if ( defined $char ) {
        $self->throw("Single gap character, not [$char]!")
            if CORE::length($char) > 1;
        $self->{'_gap_char'} = $char;
    }
    return $self->{'_gap_char'};
}
1983
1984 =head2 symbol_chars
1985
1986  Title   : symbol_chars
1987  Usage   : my @symbolchars = $aln->symbol_chars;
1988  Function: Returns all the seen symbols (other than gaps)
1989  Returns : array of characters that are the seen symbols
1990  Args    : boolean to include the gap/missing/match characters
1991
1992 =cut
1993
# List the distinct symbols seen in the alignment's sequences.
#
# The set of symbols is collected from the sequences on first use and kept
# in $self->{'_symbols'}.
#
# Args    : $includeextra - when false, the gap, match and missing
#           characters are left out of the result
# Returns : list of characters (in no particular order); an empty list for
#           an alignment without any symbols
sub symbol_chars {
    my ($self, $includeextra) = @_;

    unless ( $self->{'_symbols'} ) {
        foreach my $seq ( $self->each_seq ) {
            $self->{'_symbols'}->{$_} = 1 for split //, $seq->seq;
        }
    }

    # '_symbols' is still undefined when there was nothing to collect;
    # dereferencing it directly would die under strict refs.
    my %copy = %{ $self->{'_symbols'} || {} };

    if ( !$includeextra ) {
        foreach my $char ( $self->gap_char, $self->match_char,
                           $self->missing_char ) {
            delete $copy{$char} if defined $char;
        }
    }
    return keys %copy;
}
2011
2012 =head1 Alignment descriptors
2013
2014 These read only methods describe the MSA in various ways.
2015
2016
2017 =head2 score
2018
2019  Title     : score
2020  Usage     : $str = $ali->score()
2021  Function  : get/set a score of the alignment
2022  Returns   : a score for the alignment
2023  Argument  : an optional score to set
2024
2025 =cut
2026
# Accessor for the alignment score: when called with an argument (even an
# undefined one) that argument becomes the score; returns the stored score.
sub score {
    my ($self, @value) = @_;

    if (@value) {
        $self->{score} = $value[0];
    }
    return $self->{score};
}
2032
2033 =head2 consensus_string
2034
2035  Title     : consensus_string
2036  Usage     : $str = $ali->consensus_string($threshold_percent)
2037  Function  : Makes a strict consensus
2038  Returns   : Consensus string
2039  Argument  : Optional threshold ranging from 0 to 100.
             The consensus residue has to appear in at least threshold %
2041              of the sequences at a given location, otherwise a '?'
2042              character will be placed at that location.
2043              (Default value = 0%)
2044
2045 =cut
2046
# Concatenate the per-column consensus residues, as computed by
# _consensus_aa() with the given threshold, over all alignment columns.
# Returns an empty string when the alignment has no columns.
sub consensus_string {
    my ($self, $threshold) = @_;

    my $last_column = $self->length - 1;

    return join '',
        map { scalar( $self->_consensus_aa( $_, $threshold ) ) }
        0 .. $last_column;
}
2059
2060 =head2 consensus_conservation
2061
2062  Title     : consensus_conservation
2063  Usage     : @conservation = $ali->consensus_conservation();
2064  Function  : Conservation (as a percent) of each position of alignment
2065  Returns   : Array of percentages [0-100]. Gap columns are 0% conserved.
2066  Argument  :
2067
2068 =cut
2069
# Conservation of every alignment column, as a percentage.
#
# For each column the count of the most frequent letter reported by
# _consensus_counts() is divided by the number of sequences.
#
# Returns : list with one percentage per column; columns without any
#           counted letter (e.g. gap-only columns) yield 0
sub consensus_conservation {
    my $self = shift;
    my @cons;
    my $num_sequences = $self->num_sequences;

    foreach my $point ( 0 .. $self->length - 1 ) {
        my %hash = $self->_consensus_counts($point);

        # highest letter count in this column; stays 0 for an empty tally
        # so that no undefined value enters the arithmetic below
        my $max = 0;
        foreach my $count ( values %hash ) {
            $max = $count if $count > $max;
        }

        push @cons, $num_sequences ? 100 * $max / $num_sequences : 0;
    }
    return @cons;
}
2082
# Consensus residue of one alignment column.
#
# Args    : $point - 0-based column index
#           $threshold_percent (optional) - minimum percentage of sequences
#           that must carry the residue; a false value means no minimum
# Returns : the most frequent residue reaching the threshold (on equal
#           counts the residue sorting first wins), or '?' when there is none
sub _consensus_aa {
    my ($self, $point, $threshold_percent) = @_;
    $threshold_percent ||= -1;

    my %count_of  = $self->_consensus_counts($point);
    my $min_count = $self->num_sequences() * $threshold_percent / 100.;

    my ( $best, $best_count ) = ( '?', -1 );
    for my $residue ( sort keys %count_of ) {
        my $seen = $count_of{$residue};
        next unless $seen > $best_count && $seen >= $min_count;
        ( $best, $best_count ) = ( $residue, $seen );
    }
    return $best;
}
2104
# Tally the letters found in one alignment column.
#
# Args    : $point - 0-based column index
# Returns : hash (as a list) mapping each letter to its number of
#           occurrences; the gap character and '.' are not counted
# Throws  : when a sequence has no character at $point
sub _consensus_counts {
    my ($self, $point) = @_;

    my $gapchar = $self->gap_char;
    my %count_of;

    for my $seq ( $self->each_seq() ) {
        my $residue = substr( $seq->seq, $point, 1 );
        if ( $residue eq '' ) {
            $self->throw("--$point-----------");
        }
        next if $residue eq $gapchar;
        next if $residue =~ /\./;
        $count_of{$residue}++;
    }
    return %count_of;
}
2119
2120
2121 =head2 consensus_iupac
2122
2123  Title     : consensus_iupac
2124  Usage     : $str = $ali->consensus_iupac()
2125  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2126              and RNA. The output is in upper case except when gaps in
2127              a column force output to be in lower case.
2128
2129              Note that if your alignment sequences contain a lot of
             IUPAC ambiguity codes you often have to manually set
2131              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2132              indicate a protein sequence.
2133  Returns   : consensus string
2134  Argument  : none
2135  Throws    : on protein sequences
2136
2137 =cut
2138
# Build the IUPAC ambiguity-code consensus over all alignment columns.
# Every sequence must be DNA or RNA: a sequence whose alphabet is 'protein'
# is reported through throw().  Returns the concatenated per-column codes
# from _consensus_iupac().
sub consensus_iupac {
    my $self = shift;
    my $last_column = $self->length - 1;

    # reject protein sequences before doing any work
    for my $seq ( $self->each_seq() ) {
        next unless $seq->alphabet eq 'protein';
        $self->throw("Seq [". $seq->get_nse. "] is a protein");
    }

    # one consensus character per column
    return join '',
        map { scalar( $self->_consensus_iupac($_) ) } 0 .. $last_column;
}
2155
# IUPAC consensus character of one alignment column.
#
# Args    : $column - 0-based column index
# Returns : 'N' (or 'n' when the column also holds a non-word character)
#           as soon as any sequence has an N; '-' when the column holds
#           only non-word characters; otherwise the IUPAC code covering all
#           bases seen in the column, in lower case when the column also
#           holds a non-word character such as a gap.  'U' is reported
#           instead of 'T' when the column contained a U.
sub _consensus_iupac {
    my ($self, $column) = @_;
    my ($residues, $is_rna);

    # gather the upper-cased residues of all sequences at this column
    for my $seq ( $self->each_seq() ) {
        $residues .= substr( $seq->seq, $column, 1 );
    }
    $residues = uc $residues;

    # an N anywhere settles the column immediately
    if ( $residues =~ /N/ ) {
        return $residues =~ /\W/ ? 'n' : 'N';
    }
    # so does a column without any word character
    return '-' if $residues =~ /^\W+$/;

    # RNA is handled as DNA; one converted U is enough to register a T
    if ( $residues =~ /U/ ) {
        $residues =~ s/U/T/;
        $is_rna = 1;
    }

    # Spell out the first occurrence of every ambiguity code as the bases it
    # stands for: only the set of bases present matters from here on.
    my @expansions = (
        [ V => 'AGC' ], [ D => 'AGT' ], [ H => 'ACT' ], [ B => 'CTG' ],
        [ S => 'GC' ],  [ K => 'GT' ],  [ Y => 'CT' ],
        [ R => 'AG' ],  [ W => 'AT' ],  [ M => 'AC' ],
    );
    for my $expansion (@expansions) {
        my ( $code, $bases ) = @$expansion;
        $residues =~ s/$code/$bases/;
    }

    # IUPAC code for each combination of bases (keys list bases in ACGT order)
    my %code_for = (
        A   => 'A', C   => 'C', G   => 'G', T   => 'T',
        AG  => 'R', CT  => 'Y', CG  => 'S', AT  => 'W',
        GT  => 'K', AC  => 'M',
        ACG => 'V', AGT => 'D', ACT => 'H', CGT => 'B',
        ACGT => 'N',
    );
    my $present = join '', grep { index( $residues, $_ ) >= 0 } qw(A C G T);
    my $char    = $code_for{$present};

    $char = 'U' if $is_rna and $char eq 'T';
    $char = lc $char if $residues =~ /\W/;

    return $char;
}
2246
2247
2248 =head2 consensus_meta
2249
2250  Title     : consensus_meta
2251  Usage     : $seqmeta = $ali->consensus_meta()
2252  Function  : Returns a Bio::Seq::Meta object containing the consensus
2253              strings derived from meta data analysis.
2254  Returns   : Bio::Seq::Meta
2255  Argument  : Bio::Seq::Meta
2256  Throws    : non-MetaI object
2257
2258 =cut
2259
# Accessor for the alignment-level meta object.
# A true argument must be a Bio::Seq::MetaI object (anything else is
# reported through throw()) and replaces the stored object.  Returns the
# stored object.
sub consensus_meta {
    my ($self, $meta) = @_;

    if ($meta) {
        unless ( ref $meta && $meta->isa('Bio::Seq::MetaI') ) {
            $self->throw('Not a Bio::Seq::MetaI object');
        }
        $self->{'_aln_meta'} = $meta;
    }
    return $self->{'_aln_meta'};
}
2268
2269 =head2 is_flush
2270
2271  Title     : is_flush
2272  Usage     : if ( $ali->is_flush() )
2273  Function  : Tells you whether the alignment
2274            : is flush, i.e. all of the same length
2275  Returns   : 1 or 0
2276  Argument  :
2277
2278 =cut
2279
# Check that all sequence strings in the alignment have the same length.
#
# Args    : $report - when true, the first mismatch is also reported via
#           warn(); it is always sent to debug()
# Returns : 1 when all sequences are equally long (or there are none),
#           0 at the first sequence whose length differs from the first one
sub is_flush {
    my ($self, $report) = @_;
    my $expected;    # length of the first sequence, once seen

    for my $seq ( $self->each_seq() ) {
        my $current = CORE::length( $seq->seq() );

        if ( !defined $expected ) {
            $expected = $current;
            next;
        }
        next if $current == $expected;

        if ($report) {
            $self->warn("expecting $expected not $current from ".
                        $seq->display_id);
        }
        $self->debug("expecting $expected not $current from ".
                     $seq->display_id);
        $self->debug($seq->seq(). "\n");
        return 0;
    }

    return 1;
}
2305
2306
2307 =head2 length
2308
2309  Title     : length()
2310  Usage     : $len = $ali->length()
2311  Function  : Returns the maximum length of the alignment.
2312              To be sure the alignment is a block, use is_flush
2313  Returns   : Integer
2314  Argument  :
2315
2316 =cut
2317
# Deprecated alias of length(): emits a deprecation notice, then forwards
# all arguments to length() and returns its result.
sub length_aln {
    my ($self, @args) = @_;

    $self->deprecated("length_aln - deprecated method. Use length() instead.");
    return $self->length(@args);
}
2323
# Length of the longest sequence in the alignment, as reported by each
# sequence's length() method; -1 when the alignment holds no sequences.
sub length {
    my $self    = shift;
    my $longest = -1;

    for my $seq ( $self->each_seq() ) {
        my $current = $seq->length();
        $longest = $current if $current > $longest;
    }

    return $longest;
}
2339
2340
2341 =head2 maxdisplayname_length
2342
2343  Title     : maxdisplayname_length
2344  Usage     : $ali->maxdisplayname_length()
2345  Function  : Gets the maximum length of the displayname in the
2346              alignment. Used in writing out various MSA formats.
2347  Returns   : integer
2348  Argument  :
2349
2350 =cut
2351
# Deprecated alias of maxdisplayname_length(): emits a deprecation notice
# and returns the result of maxdisplayname_length().
sub maxname_length {
    my ($self) = @_;

    $self->deprecated("maxname_length - deprecated method.".
                      " Use maxdisplayname_length() instead.");
    return $self->maxdisplayname_length();
}
2358
# Deprecated alias of maxdisplayname_length(): emits a deprecation notice
# naming the replacement method and returns the result of
# maxdisplayname_length().
sub maxnse_length {
    my $self = shift;
    # the notice used to recommend maxnse_length() itself
    $self->deprecated("maxnse_length - deprecated method.".
                      " Use maxdisplayname_length() instead.");
    return $self->maxdisplayname_length();
}
2365
# Length of the longest display name among the alignment's sequences, where
# the display name of a sequence is displayname() of its get_nse() string;
# -1 when the alignment holds no sequences.
sub maxdisplayname_length {
    my $self    = shift;
    my $longest = -1;

    for my $seq ( $self->each_seq() ) {
        my $name_length = CORE::length $self->displayname( $seq->get_nse() );
        $longest = $name_length if $name_length > $longest;
    }

    return $longest;
}
2381
2382 =head2 max_metaname_length
2383
2384  Title     : max_metaname_length
2385  Usage     : $ali->max_metaname_length()
2386  Function  : Gets the maximum length of the meta name tags in the
2387              alignment for the sequences and for the alignment.
2388              Used in writing out various MSA formats.
2389  Returns   : integer
2390  Argument  : None
2391
2392 =cut
2393
# Length of the longest meta name used in the alignment.
#
# Considers the meta names of every sequence that is a Bio::Seq::MetaI
# object and those of the alignment-level meta object returned by
# consensus_meta().
#
# Returns : integer; -1 when no meta name exists at all
sub max_metaname_length {
    my $self = shift;
    my $maxname = -1;

    # Sequence-level meta names.  Only the class test decides whether a
    # sequence is looked at: the former "|| !$seq->meta_names" sat inside
    # the isa() argument list and never took part in the test, and a
    # sequence without meta names contributes nothing to the loop anyway.
    for my $seq ( $self->each_seq() ) {
        next unless $seq->isa('Bio::Seq::MetaI');
        for my $mtag ( $seq->meta_names ) {
            my $len = CORE::length $mtag;
            $maxname = $len if $len > $maxname;
        }
    }

    # Alignment-level meta names, when a meta object has been set.
    my $meta = $self->consensus_meta;
    if ($meta) {
        for my $name ( $meta->meta_names ) {
            my $len = CORE::length $name;
            $maxname = $len if $len > $maxname;
        }
    }

    return $maxname;
}
2423
2424 =head2 num_residues
2425
2426  Title     : num_residues
2427  Usage     : $no = $ali->num_residues
2428  Function  : number of residues in total in the alignment
2429  Returns   : integer
2430  Argument  :
2431  Note      : replaces no_residues()
2432
2433 =cut
2434
# Total number of residues in the alignment, i.e. the number of ASCII
# letters summed over all sequence strings.
sub num_residues {
    my $self  = shift;
    my $total = 0;

    for my $seq ( $self->each_seq ) {
        my $str = $seq->seq();
        # tr/// with an empty replacement list only counts the letters
        $total += ( $str =~ tr/A-Za-z// );
    }

    return $total;
}
2447
2448 =head2 num_sequences
2449
2450  Title     : num_sequences
2451  Usage     : $depth = $ali->num_sequences
 Function  : number of sequences in the alignment
2453  Returns   : integer
2454  Argument  : none
2455  Note      : replaces no_sequences()
2456
2457 =cut
2458
# Number of sequences in the alignment.
sub num_sequences {
    my $self = shift;

    # Collect the sequences in list context and count them.  Calling
    # each_seq() in scalar context would only give a count if that method
    # happens to return an array; a returned list or slice would yield its
    # last element instead.
    my @seqs = $self->each_seq;
    return scalar @seqs;
}
2463
2464
2465 =head2 average_percentage_identity
2466
2467  Title   : average_percentage_identity
2468  Usage   : $id = $align->average_percentage_identity
2469  Function: The function uses a fast method to calculate the average
2470            percentage identity of the alignment
2471  Returns : The average percentage identity of the alignment
2472  Args    : None
2473  Notes   : This method implemented by Kevin Howe calculates a figure that is
2474            designed to be similar to the average pairwise identity of the
2475            alignment (identical in the absence of gaps), without having to
2476            explicitly calculate pairwise identities proposed by Richard Durbin.
           Validated by Ewan Birney and Alex Bateman.
2478
2479 =cut
2480
# Average percentage identity of the alignment.
#
# For every column the letters A-Z are counted case-insensitively; a letter
# seen n times contributes n*(n-1) identical ordered pairs and a column
# with d counted letters contributes d*(d-1) possible ordered pairs.  The
# result is the ratio of both sums, as a percentage.
#
# Returns : percentage, or 0 when no column offers a pair to compare
# Throws  : when the sequences are not all of the same length
sub average_percentage_identity {
    my ($self, @args) = @_;

    $self->throw("All sequences in the alignment must be the same length")
        unless $self->is_flush();

    my @seqs = $self->each_seq();
    my $len  = $self->length();

    my %is_letter = map { $_ => 1 } 'A' .. 'Z';

    # per column: letter => number of sequences carrying it
    my @count_in;
    for my $seq (@seqs) {
        my $column = 0;
        for my $char ( split //, $seq->seq() ) {
            my $letter = uc $char;
            $count_in[$column]{$letter}++
                if $column < $len && $is_letter{$letter};
            $column++;
        }
    }

    my ( $identical_pairs, $possible_pairs ) = ( 0, 0 );
    for my $column ( 0 .. $len - 1 ) {
        my $depth = 0;    # counted letters in this column
        for my $seen ( values %{ $count_in[$column] || {} } ) {
            $identical_pairs += $seen * ( $seen - 1 );
            $depth           += $seen;
        }
        $possible_pairs += $depth * ( $depth - 1 );
    }

    return $possible_pairs > 0
        ? ( $identical_pairs / $possible_pairs ) * 100.0
        : 0;
}
2526
2527 =head2 percentage_identity
2528
2529  Title   : percentage_identity
2530  Usage   : $id = $align->percentage_identity
2531  Function: The function calculates the average percentage identity
2532            (aliased to average_percentage_identity)
2533  Returns : The average percentage identity
2534  Args    : None
2535
2536 =cut
2537
# Alias of average_percentage_identity(); any arguments are ignored.
sub percentage_identity {
    my ($self) = @_;
    return $self->average_percentage_identity();
}
2542
2543 =head2 overall_percentage_identity
2544
2545  Title   : overall_percentage_identity
2546  Usage   : $id = $align->overall_percentage_identity
2547            $id = $align->overall_percentage_identity('short')
2548  Function: The function calculates the percentage identity of
2549            the conserved columns
2550  Returns : The percentage identity of the conserved columns
2551  Args    : length value to use, optional defaults to alignment length
2552                  possible values: 'align', 'short', 'long'
2553
2554 The argument values 'short' and 'long' refer to shortest and longest
2555 sequence in the alignment. Method modification code by Hongyu Zhang.
2556
2557 =cut
2558
# Percentage of alignment columns in which all sequences carry the same
# letter (compared case-insensitively).
#
# Args    : $length_measure (optional) - what the number of conserved
#           columns is divided by:
#             'align' (default) - the alignment length
#             'short'           - residue count of the shortest sequence
#             'long'            - residue count of the longest sequence
# Returns : percentage; 0 when the chosen length is zero or unavailable
# Throws  : on an unknown $length_measure, or when the sequences are not
#           all of the same length
sub overall_percentage_identity {
    my ($self, $length_measure) = @_;

    my %alphabet = map { $_ => undef } 'A' .. 'Z';
    my %enum     = map { $_ => undef } qw (align short long);

    $self->throw("Unknown argument [$length_measure]")
        if $length_measure and not exists $enum{$length_measure};
    $length_measure ||= 'align';

    if ( !$self->is_flush() ) {
        $self->throw("All sequences in the alignment must be the same length");
    }

    my $len;          # divisor, chosen according to $length_measure
    my $total = 0;    # number of columns with identical residues
    my @countHashes;  # per column: letter => number of sequences carrying it
    my @seqs     = $self->each_seq;
    my $nof_seqs = scalar @seqs;
    my $aln_len  = $self->length();

    for my $seq (@seqs) {
        my $seqstr = $seq->seq;

        # Count the letters of this sequence; a column is conserved once
        # one letter has been seen in every sequence.
        for my $column ( 0 .. $aln_len - 1 ) {
            my $char = uc( substr( $seqstr, $column, 1 ) );
            next unless exists $alphabet{$char};
            $total++ if ++$countHashes[$column]->{$char} == $nof_seqs;
        }

        next if $length_measure eq 'align';

        # Residue count of this sequence.  tr/// takes a plain character
        # list, so no brackets here: they would be counted as residues.
        my $seq_len = ( $seqstr =~ tr/A-Za-z// );
        if ( !defined $len ) {
            $len = $seq_len;
        }
        elsif ( $length_measure eq 'short' ) {
            $len = $seq_len if $seq_len < $len;
        }
        else {
            $len = $seq_len if $seq_len > $len;
        }
    }

    $len = $aln_len if $length_measure eq 'align';

    # nothing to divide by: empty alignment or sequences without residues
    return 0 unless defined $len && $len > 0;

    return ( $total / $len ) * 100.0;
}
2626
2627
2628
2629 =head1 Alignment positions
2630
2631 Methods to map a sequence position into an alignment column and back.
2632 column_from_residue_number() does the former. The latter is really a
property of the sequence object and can be done using
2634 L<Bio::LocatableSeq::location_from_column>:
2635
2636     # select somehow a sequence from the alignment, e.g.
2637     my $seq = $aln->get_seq_by_pos(1);
2638     #$loc is undef or Bio::LocationI object
2639     my $loc = $seq->location_from_column(5);
2640
2641 =head2 column_from_residue_number
2642
2643  Title   : column_from_residue_number
2644  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2645  Function: This function gives the position in the alignment
2646            (i.e. column number) of the given residue number in the
2647            sequence with the given name. For example, for the
2648            alignment
2649
2650              Seq1/91-97 AC..DEF.GH.
2651              Seq2/24-30 ACGG.RTY...
             Seq3/43-51 AC.DDEF.GHI
2653
2654            column_from_residue_number( "Seq1", 94 ) returns 6.
2655            column_from_residue_number( "Seq2", 25 ) returns 2.
2656            column_from_residue_number( "Seq3", 50 ) returns 10.
2657
           An exception is thrown if the residue number would lie
           outside the length of the alignment
           (e.g. column_from_residue_number( "Seq2", 22 )).

          Note: If the parent sequence is represented by more than
                one alignment sequence and the residue number is present in
                them, this method finds only the first one.
2665
2666  Returns : A column number for the position in the alignment of the
2667            given residue in the given sequence (1 = first column)
2668  Args    : A sequence id/name (not a name/start-end)
2669            A residue number in the whole sequence (not just that
2670            segment of it in the alignment)
2671
2672 =cut
2673
# Alignment column holding a given residue of a named sequence.
#
# Args    : $name      - sequence id (a key of the '_start_end_lists' table)
#           $resnumber - residue number; must be a true value
# Returns : the column reported by the first sequence with that id whose own
#           column_from_residue_number() succeeds for $resnumber
# Throws  : when the name is unknown, the residue number is missing, or no
#           sequence with that id can place the residue
sub column_from_residue_number {
    my ($self, $name, $resnumber) = @_;

    unless ( $self->{'_start_end_lists'}->{$name} ) {
        $self->throw("No sequence with name [$name]");
    }
    unless ($resnumber) {
        $self->throw("Second argument residue number missing");
    }

    # a segment that does not contain the residue dies; try the next one
    for my $seq ( $self->each_seq_with_id($name) ) {
        my $col = eval { $seq->column_from_residue_number($resnumber) };
        return $col unless $@;
    }

    $self->throw("Could not find a sequence segment in $name ".
                 "containing residue number $resnumber");

}
2693
2694 =head1 Sequence names
2695
2696 Methods to manipulate the display name. The default name based on the
2697 sequence id and subsequence positions can be overridden in various
2698 ways.
2699
2700 =head2 displayname
2701
2702  Title     : displayname
2703  Usage     : $myalign->displayname("Ig", "IgA")
2704  Function  : Gets/sets the display name of a sequence in the alignment
2705  Returns   : A display name string
2706  Argument  : name of the sequence
2707              displayname of the sequence (optional)
2708
2709 =cut
2710
# Deprecated alias: reports the deprecation, then forwards all remaining
# arguments to displayname() and returns whatever that call returns.
sub get_displayname {
    my ($self, @args) = @_;
    $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
    return $self->displayname(@args);
}
2716
# Deprecated alias: reports the deprecation, then forwards all remaining
# arguments to displayname() and returns whatever that call returns.
sub set_displayname {
    my ($self, @args) = @_;
    $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
    return $self->displayname(@args);
}
2722
# Get or set the display name stored for one alignment row.
#
#   $name    - key of the row in the alignment's '_seq' table
#   $disname - optional new display name; when defined and non-empty it
#              is stored under $name and returned
#
# Called without a new display name, returns the stored display name, or
# $name itself when none has been stored. Throws when $name is not a
# defined entry of the '_seq' table.
sub displayname {
    my ($self, $name, $disname) = @_;

    $self->throw("No sequence with name [$name]")
        unless defined $self->{'_seq'}->{$name};

    # "defined and non-empty" instead of plain truth, so that a display
    # name of "0" is stored rather than silently ignored. CORE:: keeps
    # the builtin apart from the alignment's own length() method.
    if ( defined $disname && CORE::length($disname) ) {
        $self->{'_dis_name'}->{$name} = $disname;
        return $disname;
    }

    if ( defined $self->{'_dis_name'}->{$name} ) {
        return $self->{'_dis_name'}->{$name};
    }

    # nothing stored yet: the row key doubles as its display name
    return $name;
}
2739
2740 =head2 set_displayname_count
2741
2742  Title     : set_displayname_count
2743  Usage     : $ali->set_displayname_count
2744  Function  : Sets the names to be name_# where # is the number of
2745              times this name has been used.
2746  Returns   : 1, on success
2747  Argument  :
2748
2749 =cut
2750
# Give every row the display name "<id>_<n>", where <n> counts the rows
# sharing that id in the order delivered by each_alphabetically().
# Returns 1.
sub set_displayname_count {
    my $self = shift;
    my ($current_id, $serial);

    for my $member ( $self->each_alphabetically() ) {
        my $key = $member->get_nse();

        # the first row, or a row whose id differs from the previous
        # one, restarts the numbering at 1
        if ( !defined $current_id or $current_id ne $member->id() ) {
            $serial     = 1;
            $current_id = $member->id();
        }

        $self->displayname( $key, sprintf("%s_%s", $current_id, $serial) );
        $serial++;
    }
    return 1;
}
2775
2776 =head2 set_displayname_flat
2777
2778  Title     : set_displayname_flat
2779  Usage     : $ali->set_displayname_flat()
2780  Function  : Makes all the sequences be displayed as just their name,
2781              not name/start-end
2782  Returns   : 1
2783  Argument  :
2784
2785 =cut
2786
# Set the display name of every row to the row's bare id. Rows are
# addressed by the string their get_nse() method returns. Returns 1.
sub set_displayname_flat {
    my ($self) = @_;

    for my $row ( $self->each_seq() ) {
        $self->displayname( $row->get_nse(), $row->id() );
    }
    return 1;
}
2797
2798 =head2 set_displayname_normal
2799
2800  Title     : set_displayname_normal
2801  Usage     : $ali->set_displayname_normal()
2802  Function  : Makes all the sequences be displayed as name/start-end
2803  Returns   : 1, on success
2804  Argument  :
2805
2806 =cut
2807
# Set the display name of every row to the string its get_nse() method
# returns, which is also the key the row is addressed by. Returns 1.
sub set_displayname_normal {
    my ($self) = @_;

    for my $row ( $self->each_seq() ) {
        my $key = $row->get_nse();
        $self->displayname( $key, $key );
    }
    return 1;
}
2818
2819 =head2 source
2820
2821  Title   : source
2822  Usage   : $obj->source($newval)
2823  Function: sets the Alignment source program
2824  Example :
2825  Returns : value of source
2826  Args    : newvalue (optional)
2827
2828
2829 =cut
2830
# Accessor for the '_source' slot. A defined argument is stored first;
# the (possibly updated) slot value is always returned.
sub source {
    my $self = shift;
    $self->{'_source'} = $_[0] if defined $_[0];
    return $self->{'_source'};
}
2838
2839 =head2 set_displayname_safe
2840
2841  Title     : set_displayname_safe
2842  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2843  Function  : Assign machine-generated serial names to sequences in input order.
2844              Designed to protect names during PHYLIP runs. Assign 10-char string
2845              in the form of "S000000001" to "S999999999". Restore the original
2846              names using "restore_displayname".
2847  Returns   : 1. a new $aln with system names;
2848              2. a hash ref for restoring names
2849  Argument  : Number for id length (default 10)
2850
2851 =cut
2852
# Build a new alignment whose rows carry serial ids ("S" followed by a
# zero-padded counter) in place of their real ids.
#
#   $idlength - total id length including the leading "S" (default 10)
#
# Returns a two-element list: the new alignment and a hash reference
# mapping each generated id back to the id it replaced.
sub set_displayname_safe {
    my ($self, $idlength) = @_;
    $idlength ||= 10;

    # one slot goes to the "S", the rest to the zero-padded counter
    my $format   = 'S%0' . ($idlength - 1) . 's';
    my $safe_aln = Bio::SimpleAlign->new();
    my %original_id_of;
    my $serial = 0;

    for my $row ( $self->each_seq() ) {
        $serial++;
        my $alias = sprintf $format, $serial;
        $original_id_of{$alias} = $row->id();

        $safe_aln->add_seq(
            Bio::LocatableSeq->new(
                -id       => $alias,
                -seq      => $row->seq(),
                -alphabet => $row->alphabet,
                -start    => $row->{_start},
                -end      => $row->{_end}
            )
        );
    }

    $self->debug("$serial seq names changed. Restore names by using restore_displayname.");
    return ($safe_aln, \%original_id_of);
}
2875
2876 =head2 restore_displayname
2877
2878  Title     : restore_displayname
2879  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2880  Function  : Restore original sequence names (after running
2881              $ali->set_displayname_safe)
2882  Returns   : a new $aln with names restored.
2883  Argument  : a hash reference of names from "set_displayname_safe".
2884
2885 =cut
2886
# Build a new alignment in which every row id is replaced by the value
# stored for it in the supplied mapping.
#
#   $ref - hash reference mapping current row ids to the ids to restore
#          (the second return value of set_displayname_safe)
#
# Returns the new alignment. Throws when $ref is not a hash reference or
# when a row id has no defined entry in the mapping.
sub restore_displayname {
    my ($self, $ref) = @_;

    # Reject a missing or non-hash argument with a readable exception
    # instead of a raw dereference error.
    require Scalar::Util;
    $self->throw("Need a hash reference of names from set_displayname_safe")
        unless ( Scalar::Util::reftype($ref) || '' ) eq 'HASH';

    my $new = Bio::SimpleAlign->new();
    foreach my $seq ( $self->each_seq() ) {
        my $safe_id = $seq->id();

        # name the offending id, like the other "No sequence with name"
        # exceptions in this class do
        $self->throw("No sequence with name [$safe_id]")
            unless defined $ref->{$safe_id};

        my $new_seq = Bio::LocatableSeq->new(-id       => $ref->{$safe_id},
                                             -seq      => $seq->seq(),
                                             -alphabet => $seq->alphabet,
                                             -start    => $seq->{_start},
                                             -end      => $seq->{_end}
                                             );
        $new->add_seq($new_seq);
    }
    return $new;
}
2904
2905 =head2 sort_by_start
2906
2907  Title     : sort_by_start
2908  Usage     : $ali->sort_by_start
2909  Function  : Reorders the sequences in the alignment by the start position
2910              of each sequence segment (ascending)
2911  Returns   : 1
2912  Argument  :
2913
2914 =cut
2915
# Rebuild the '_order' table (position => row key) so that the row keys
# obtained from get_nse() appear in the order produced by the _startend
# comparator. Returns 1.
sub sort_by_start {
    my ($self) = @_;

    # collect the row keys; identical keys collapse into one entry
    my %seen_key;
    for my $row ( $self->each_seq() ) {
        $seen_key{ $row->get_nse } = $row;
    }

    %{$self->{'_order'}} = ();    # discard the previous ordering

    my $position = 0;
    for my $key ( sort _startend keys %seen_key ) {
        $self->{'_order'}->{$position} = $key;
        $position++;
    }
    return 1;
}
2931
# Sort comparator (reads the package variables $a and $b): orders
# "name/start-end" strings by ascending numeric start.
#
# The range is taken from the text after the LAST '/', so a name that
# itself contains '/' no longer has part of the name mistaken for the
# range.
sub _startend
{
    my ($astart) = split( /\-/, substr( $a, rindex( $a, '/' ) + 1 ) );
    my ($bstart) = split( /\-/, substr( $b, rindex( $b, '/' ) + 1 ) );
    return $astart <=> $bstart;
}
2940
2941 =head2 bracket_string
2942
2943  Title     : bracket_string
2944  Usage     : my @params = (-refseq     => 'testseq',
2945                            -allele1    => 'allele1',
2946                            -allele2    => 'allele2',
2947                            -delimiters => '{}',
2948                            -separator  => '/');
2949              $str = $aln->bracket_string(@params)
2950
2951  Function :  When supplied with a list of parameters (see below), returns a
2952              string in BIC format. This is used for allelic comparisons.
2953              Briefly, if either allele contains a base change when compared to
2954              the refseq, the base or gap for each allele is represented in
2955              brackets in the order allele1, allele2.
2956
2957              For the following data:
2958
2959              >testseq
2960              GGATCCATTGCTACT
2961              >allele1
2962              GGATCCATTCCTACT
2963              >allele2
2964              GGAT--ATTCCTCCT
2965
2966              the returned string with parameters '-refseq => testseq',
2967              '-allele1 => allele1' and '-allele2 => allele2' would be:
2968
2969              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2970  Returns   : BIC-formatted string
2971  Argument  : Required args
2972                 refseq    : string (ID) of the reference sequence used
2973                             as basis for comparison
2974                 allele1   : string (ID) of the first allele
2975                 allele2   : string (ID) of the second allele
2976              Optional args
2977                 delimiters: two symbol string of left and right delimiters.
2978                             Only the first two symbols are used
2979                             default = '[]'
2980                 separator : string used as a separator.  Only the first
2981                             symbol is used
2982                             default = '/'
2983  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2984
2985 =cut
2986
# Compare two allele rows against a reference row column by column and
# return the result as one string: a column where both alleles equal
# the reference contributes the reference symbol, any other column
# contributes <left delimiter><allele1><separator><allele2><right delimiter>.
#
# Named arguments (unpacked with _rearrange):
#   refseq, allele1, allele2 - ids of the three rows (required)
#   delimiters               - first character is the left delimiter,
#                              the remainder the right one (default '[]')
#   separator                - text placed between the two allele
#                              symbols (default '/')
#
# Throws when an id argument is missing or matches no row.
sub bracket_string {
    my ($self, @args) = @_;
    my ($ref, $a1, $a2, $delim, $sep) =
        $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
    $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);

    my ($ld, $rd);
    ($ld, $rd) = split('', $delim, 2) if $delim;
    $ld  ||= '[';
    $rd  ||= ']';
    $sep ||= '/';

    # each_seq_with_id() returns a list. Keep exactly one entry (the
    # first hit, or undef) per id, so that an id matching no row or
    # several rows cannot shift the refseq/allele1/allele2 assignment.
    my ($refseq, $allele1, $allele2) = map {
        my ($first_hit) = $self->each_seq_with_id($_);
        $first_hit;
    } ($ref, $a1, $a2);
    if (!$refseq || !$allele1 || !$allele2) {
        $self->throw("One of your refseq/allele IDs is invalid!");
    }

    # fetch each sequence string once instead of once per column
    my @strings = map { $_->seq } ($refseq, $allele1, $allele2);

    my $bic = '';
    # loop over the alignment columns (0-based offsets into the strings)
    for my $column ( 0 .. $self->length - 1 ) {
        my ($compres, $res1, $res2) =
            map { substr($_, $column, 1) } @strings;

        # bracket the column as soon as either allele differs from the refseq
        $bic .= ($compres eq $res1 && $compres eq $res2)
              ? $compres
              : $ld.$res1.$sep.$res2.$rd;
    }
    return $bic;
}
3016
3017
3018 =head2 methods implementing Bio::FeatureHolderI
3019
3020 FeatureHolderI implementation to support labeled character sets like one
3021 would get from NEXUS represented data.
3022
3023 =head2 get_SeqFeatures
3024
3025  Usage   : @features = $aln->get_SeqFeatures
3026  Function: Get the feature objects held by this feature holder.
3027  Example :
3028  Returns : an array of Bio::SeqFeatureI implementing objects
3029  Args    : optional filter coderef, taking a Bio::SeqFeatureI
3030          : as argument, returning TRUE if wanted, FALSE if
3031          : unwanted
3032
3033 =cut
3034
# Return the features held in the '_as_feat' array, creating that array
# on first use.
#
#   $filter_cb - optional code reference; when given, only the features
#                for which it returns true are returned
#
# Throws when a defined $filter_cb is not a plain code reference.
sub get_SeqFeatures {
    my ($self, $filter_cb) = @_;

    if ( defined($filter_cb) and ref($filter_cb) ne 'CODE' ) {
        $self->throw("Arg (filter callback) must be a coderef");
    }

    $self->{'_as_feat'} = [] unless defined $self->{'_as_feat'};
    my $held = $self->{'_as_feat'};

    return $filter_cb
        ? grep { $filter_cb->($_) } @{$held}
        : @{$held};
}
3048
3049
3050 =head2 add_SeqFeature
3051
3052  Usage   : $aln->add_SeqFeature($subfeat);
3053  Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3054            (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3055  Example :
3056  Returns : true on success
3057  Args    : a Bio::SeqFeatureI object
3058
3059 =cut
3060
# Append feature objects to the '_as_feat' array.
#
#   @feat - a Bio::SeqFeatureI object, optionally accompanied by the
#           'EXPAND' qualifier (accepted and ignored). Passing more than
#           one feature is deprecated.
#
# Returns 1. Throws when an argument other than 'EXPAND' is not an
# object that isa Bio::SeqFeatureI.
sub add_SeqFeature {
    my ($self, @feat) = @_;

    require Scalar::Util;

    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};

    # 'EXPAND' is a qualifier, not a feature: strip it first so that
    # add_SeqFeature($feat, 'EXPAND') is not reported as a multi-feature
    # call. Need to support it for FeatureHolderI compliance.
    my @features =
        grep { !( defined $_ && !ref $_ && $_ eq 'EXPAND' ) } @feat;

    if (scalar @features > 1) {
        $self->deprecated(
            -message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
                        ' is deprecated and will be removed in a future version. '.
                        'Add a single feature at a time instead.',
            -warn_version    => 1.007,
            -throw_version   => 1.009,
        );
    }

    for my $feat ( @features ) {

        # blessed() first: calling ->isa on undef or an unblessed
        # reference would die before the intended exception, and a
        # plain class-name string would wrongly pass the isa test
        unless ( Scalar::Util::blessed($feat) && $feat->isa("Bio::SeqFeatureI") ) {
            my $got = defined $feat ? $feat : 'undef';
            $self->throw("Expected a Bio::SeqFeatureI object, but got a $got.");
        }

        push @{$self->{'_as_feat'}}, $feat;
    }
    return 1;
}
3088
3089
3090 =head2 remove_SeqFeatures
3091
3092  Usage   : $obj->remove_SeqFeatures
3093  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3094            remove that subset from the returned array, and add back the rest.
3095  Returns : The array of Bio::SeqFeatureI features that was
3096            deleted from this alignment.
3097  Args    : none
3098
3099 =cut
3100
# Empty the '_as_feat' array and return the features it held. When the
# slot is unset (or false) an empty list is returned and nothing is
# changed.
sub remove_SeqFeatures {
    my ($self) = @_;

    my $held = $self->{'_as_feat'};
    if ( !$held ) {
        return ();
    }

    my @removed = @{$held};
    $self->{'_as_feat'} = [];    # fresh array; the old one is left untouched
    return @removed;
}
3109
3110 =head2 feature_count
3111
3112  Title   : feature_count
3113  Usage   : $obj->feature_count()
3114  Function: Return the number of SeqFeatures attached to the alignment
3115  Returns : integer representing the number of SeqFeatures
3116  Args    : None
3117
3118 =cut
3119
# Number of entries in the '_as_feat' array; 0 when the slot is unset.
sub feature_count {
    my $self = shift;

    return 0 unless defined $self->{'_as_feat'};
    return scalar @{ $self->{'_as_feat'} };
}
3129
3130 =head2 get_all_SeqFeatures
3131
3132  Title   : get_all_SeqFeatures
3133  Usage   :
3134  Function: Get all SeqFeatures.
3135  Example :
3136  Returns : an array of Bio::SeqFeatureI implementing objects
3137  Args    : none
3138  Note    : Falls through to Bio::FeatureHolderI implementation.
3139
3140 =cut
3141
3142 =head2 methods for Bio::AnnotatableI
3143
3144 AnnotatableI implementation to support sequence alignments which
3145 contain annotation (NEXUS, Stockholm).
3146
3147 =head2 annotation
3148
3149  Title   : annotation
3150  Usage   : $ann = $aln->annotation or
3151            $aln->annotation($ann)
3152  Function: Gets or sets the annotation
3153  Returns : Bio::AnnotationCollectionI object
3154  Args    : None or Bio::AnnotationCollectionI object
3155
3156 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3157 for more information
3158
3159 =cut
3160
# Accessor for the '_annotation' slot.
#
#   $value - optional object that isa Bio::AnnotationCollectionI; when
#            given it replaces the stored collection
#
# When nothing is stored yet and no value is given, a new
# Bio::Annotation::Collection is created and stored. Returns the stored
# collection. Throws when $value fails the isa check.
sub annotation {
    my ($self, $value) = @_;

    if ( defined $value ) {
        unless ( $value->isa("Bio::AnnotationCollectionI") ) {
            $self->throw("object of class ".ref($value)." does not implement ".
                         "Bio::AnnotationCollectionI. Too bad.");
        }
        $self->{'_annotation'} = $value;
    }
    elsif ( !defined $self->{'_annotation'} ) {
        # first read without a stored collection: create one
        $self->{'_annotation'} = Bio::Annotation::Collection->new();
    }

    return $self->{'_annotation'};
}
3173
3174 =head1 Deprecated methods
3175
3176 =cut
3177
3178 =head2 no_residues
3179
3180  Title     : no_residues
3181  Usage     : $no = $ali->no_residues
3182  Function  : number of residues in total in the alignment
3183  Returns   : integer
3184  Argument  :
3185  Note      : deprecated in favor of num_residues()
3186
3187 =cut
3188
# Deprecated alias: reports the deprecation, then forwards all remaining
# arguments to num_residues() and returns its result.
sub no_residues {
    my ($self, @args) = @_;

    $self->deprecated(
        -warn_version  => 1.0069,
        -throw_version => 1.0075,
        -message       => 'Use of method no_residues() is deprecated, use num_residues() instead',
    );
    return $self->num_residues(@args);
}
3196
3197 =head2 no_sequences
3198
3199  Title     : no_sequences
3200  Usage     : $depth = $ali->no_sequences
3201  Function  : number of sequences in the alignment
3202  Returns   : integer
3203  Argument  :
3204  Note      : deprecated in favor of num_sequences()
3205
3206 =cut
3207
# Deprecated alias: reports the deprecation, then forwards all remaining
# arguments to num_sequences() and returns its result.
sub no_sequences {
    my ($self, @args) = @_;

    $self->deprecated(
        -warn_version  => 1.0069,
        -throw_version => 1.0075,
        -message       => 'Use of method no_sequences() is deprecated, use num_sequences() instead',
    );
    return $self->num_sequences(@args);
}
3215
3216 =head2 mask_columns
3217
3218  Title     : mask_columns
3219  Usage     : $aln2 = $aln->mask_columns(20,30)
3220  Function  : Masks a slice of the alignment inclusive of start and
3221              end columns, and the first column in the alignment is denoted 1.
3222              Mask beyond the length of the sequence does not do padding.
3223  Returns   : A Bio::SimpleAlign object
3224  Args      : Positive integer for start column, positive integer for end column,
3225              optional string value use for the mask. Example:
3226
3227              $aln2 = $aln->mask_columns(20,30,'?')
3228  Note      : Masking must use a character that is not used for gaps or
3229              frameshifts.  These can be adjusted using the relevant global
3230              variables, but be aware these may be (uncontrollably) modified
3231              elsewhere within BioPerl (see bug 2715)
3232
3233 =cut
3234
# Return a new alignment in which, for every row, the symbols in columns
# $start..$end (1-based, inclusive) are replaced by the mask character.
# Symbols listed in the gap/frameshift symbol strings are left as they
# are.
#
#   $start, $end - positive integers, $start <= $end <= alignment length
#   $mask_char   - single masking character (default 'N'); must not be a
#                  gap or frameshift symbol
#
# The new alignment comes from $self->new and gets the id of $self; each
# new row is created with the id, alphabet and strand of the original
# row. Throws on invalid coordinates or an invalid mask character.
sub mask_columns {
    #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
    my ($self, $start, $end, $mask_char) = @_;
    $mask_char = 'N' unless defined $mask_char;

    # The combined symbol string is used as character-class CONTENT,
    # so every pattern built from it must wrap it in [...]
    my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
                 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;

    # coordinates are alignment-based, not sequence-based
    my $aln_length = $self->length;

    $self->throw("Mask start has to be a positive integer and less than ".
                 "alignment length, not [$start]")
      unless $start =~ /^\d+$/ && $start > 0 && $start <= $aln_length;
    $self->throw("Mask end has to be a positive integer and less than ".
                 "alignment length, not [$end]")
      unless $end =~ /^\d+$/ && $end > 0 && $end <= $aln_length;
    $self->throw("Mask start [$start] has to be smaller than or equal to ".
                 "end [$end]") unless $start <= $end;

    # Without the brackets the symbol string is matched as one literal
    # sequence, which a single character can never match, so gap and
    # frameshift symbols slipped through this check.
    $self->throw("Mask character $mask_char has to be a single character ".
                 "and not a gap or frameshift symbol")
      unless CORE::length($mask_char) == 1 && $mask_char !~ m{[$nonres]};

    my $aln = $self->new;
    $aln->id($self->id);
    foreach my $seq ( $self->each_seq() ) {
        my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
         -alphabet => $seq->alphabet,
         -strand  => $seq->strand,
         -verbose => $self->verbose);

        # fetch the row string once; it is cut into three pieces below
        my $residues = $seq->seq;

        # convert from 1-based alignment coords!
        my $masked_string = substr($residues, $start - 1, $end - $start + 1);
        $masked_string =~ s{[^$nonres]}{$mask_char}g;

        $new_seq->seq( substr($residues, 0, $start - 1)
                     . $masked_string
                     . substr($residues, $end) );
        $aln->add_seq($new_seq);
    }
    return $aln;
}
3275
3276
3277
3278 1;