branch-1-6/Bio/SimpleAlign.pm

   1 # $Id$
   2 # BioPerl module for SimpleAlign
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13 #
  14 #  History:
  15 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  16 #       May 2001 major rewrite - Heikki Lehvaslaiho
  17
  18 =head1 NAME
  19
  20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  21
  22 =head1 SYNOPSIS
  23
  24   # Use Bio::AlignIO to read in the alignment
  25   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  26   $aln = $str->next_aln();
  27
  28   # Describe
  29   print $aln->length;
  30   print $aln->num_residues;
  31   print $aln->is_flush;
  32   print $aln->num_sequences;
  33   print $aln->score;
  34   print $aln->percentage_identity;
  35   print $aln->consensus_string(50);
  36
  37   # Find the position in the alignment for a sequence location
  38   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  39
  40   # Extract sequences and check values for the alignment column $pos
  41   foreach $seq ($aln->each_seq) {
  42       $res = $seq->subseq($pos, $pos);
  43       $count{$res}++;
  44   }
  45   foreach $res (keys %count) {
  46       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  47   }
  48
  49   # Manipulate
  50   $aln->remove_seq($seq);
  51   $mini_aln = $aln->slice(20,30);  # get a block of columns
  52   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  53   $new_aln = $aln->remove_columns([20,30]); # remove by position
  54   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  55
  56   # Analyze
  57   $str = $aln->consensus_string($threshold_percent);
  58   $str = $aln->match_line();
  59   $str = $aln->cigar_line();
  60   $id = $aln->percentage_identity;
  61
  62   # See the module documentation for details and more methods.
  63
  64 =head1 DESCRIPTION
  65
  66 SimpleAlign is an object that handles a multiple sequence alignment
  67 (MSA). It is very permissive of types (it does not insist on sequences
  68 being all same length, for example). Think of it as a set of sequences
  69 with a whole series of built-in manipulations and methods for reading and
  70 writing alignments.
  71
  72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  73 to store its sequences. These are subsequences with a start and end
  74 positions in the parent reference sequence. Each sequence in the
  75 SimpleAlign object is a Bio::LocatableSeq.
  76
  77 SimpleAlign expects the combination of name, start, and end for a
  78 given sequence to be unique in the alignment, and this is the key for the
  79 internal hashes (name, start, end are abbreviated C<nse> in the code).
  80 However, in some cases people do not want the name/start-end to be displayed:
  81 either multiple names in an alignment or names specific to the alignment
  82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  83 C<displayname>, and generally is what is used to print out the
  84 alignment. They default to name/start-end.
  85
  86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  87
  88 =head1 FEEDBACK
  89
  90 =head2 Mailing Lists
  91
  92 User feedback is an integral part of the evolution of this and other
  93 Bioperl modules.  Send your comments and suggestions preferably to one
  94 of the Bioperl mailing lists.  Your participation is much appreciated.
  95
  96   bioperl-l@bioperl.org                  - General discussion
  97   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  98
  99 =head2 Support
 100
 101 Please direct usage questions or support issues to the mailing list:
 102
 103 I<bioperl-l@bioperl.org>
 104
 105 rather than to the module maintainer directly. Many experienced and
 106 reponsive experts will be able look at the problem and quickly
 107 address it. Please include a thorough description of the problem
 108 with code and data examples if at all possible.
 109
 110 =head2 Reporting Bugs
 111
 112 Report bugs to the Bioperl bug tracking system to help us keep track
 113 the bugs and their resolution. Bug reports can be submitted via the
 114 web:
 115
 116   http://bugzilla.open-bio.org/
 117
 118 =head1 AUTHOR
 119
 120 Ewan Birney, birney@ebi.ac.uk
 121
 122 =head1 CONTRIBUTORS
 123
 124 Allen Day, allenday-at-ucla.edu,
 125 Richard Adams, Richard.Adams-at-ed.ac.uk,
 126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 128 Allen Smith, allens-at-cpan.org,
 129 Jason Stajich, jason-at-bioperl.org,
 130 Anthony Underwood, aunderwood-at-phls.org.uk,
 131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 132 Brian Osborne, bosborne at alum.mit.edu
 133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 134 Hongyu Zhang, forward at hongyu.org
 135 Jay Hannah, jay at jays.net
 136 Alexandr Bezginov, albezg at gmail.com
 137
 138 =head1 SEE ALSO
 139
 140 L<Bio::LocatableSeq>
 141
 142 =head1 APPENDIX
 143
 144 The rest of the documentation details each of the object
 145 methods. Internal methods are usually preceded with a _
 146
 147 =cut
 148
 149 # 'Let the code begin...
 150
 151 package Bio::SimpleAlign;
 152 use vars qw(%CONSERVATION_GROUPS);
 153 use strict;
 154
 155 use Bio::LocatableSeq;  # uses Seq's as list
 156
 157 use Bio::Seq;
 158 use Bio::SeqFeature::Generic;
 159
 160 BEGIN {
 161     # This data should probably be in a more centralized module...
 162     # it is taken from Clustalw documentation.
 163     # These are all the positively scoring groups that occur in the
 164     # Gonnet Pam250 matrix. The strong and weak groups are
 165     # defined as strong score >0.5 and weak score =<0.5 respectively.
 166
 167     %CONSERVATION_GROUPS = (
 168             'strong' => [ qw(
 169                                                  STA
 170                                                  NEQK
 171                                                  NHQK
 172                                                  NDEQ
 173                                                  QHRK
 174                                                  MILV
 175                                                  MILF
 176                                                  HY
 177                                                  FYW )],
 178                                 'weak' => [ qw(
 179                       CSA
 180                                                ATV
 181                                                SAG
 182                                                STNK
 183                                                STPA
 184                                                SGND
 185                                                SNDEQK
 186                                                NDEQHK
 187                                                NEQHRK
 188                                                FVLIM
 189                                                HFY )],);
 190 }
 191
 192 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 193             Bio::FeatureHolderI);
 194
 195 =head2 new
 196
 197  Title     : new
 198  Usage     : my $aln = Bio::SimpleAlign->new();
 199  Function  : Creates a new simple align object
 200  Returns   : Bio::SimpleAlign
 201  Args      : -source     => string representing the source program
 202                             where this alignment came from
 203              -annotation => Bio::AnnotationCollectionI
 204              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 205              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 206              -consensus  => consensus string
 207              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 208
 209 =cut
 210
 211
 212 sub new {
 213   my($class,@args) = @_;
 214
 215   my $self = $class->SUPER::new(@args);
 216
 217   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 218                                             SOURCE
 219                                             SCORE
 220                                             ID
 221                                             ACCESSION
 222                                             DESCRIPTION
 223                                             SEQS
 224                                             FEATURES
 225                                             ANNOTATION
 226                                             SEQ_ANNOTATION
 227                                             CONSENSUS
 228                                             CONSENSUS_META
 229                                             )], @args);
 230   $src && $self->source($src);
 231   defined $score && $self->score($score);
 232   # we need to set up internal hashs first!
 233
 234   $self->{'_seq'} = {};
 235   $self->{'_order'} = {};
 236   $self->{'_start_end_lists'} = {};
 237   $self->{'_dis_name'} = {};
 238   $self->{'_id'} = 'NoName';
 239   # maybe we should automatically read in from args. Hmmm...
 240   $id  && $self->id($id);
 241   $acc && $self->accession($acc);
 242   $desc && $self->description($desc);
 243   $coll && $self->annotation($coll);
 244   # sequence annotation is layered into a provided annotation collection (or dies)
 245   if ($sa) {
 246     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 247                  "with a sequence annotation collection")
 248         if !$coll;
 249     $coll->add_Annotation('seq_annotation', $sa);
 250   }
 251   if ($feats && ref $feats eq 'ARRAY') {
 252     for my $feat (@$feats) {
 253         $self->add_SeqFeature($feat);
 254     }
 255   }
 256   $con && $self->consensus($con);
 257   $cmeta && $self->consensus_meta($cmeta);
 258   # assumes these are in correct alignment order
 259   if ($seqs && ref($seqs) eq 'ARRAY') {
 260     for my $seq (@$seqs) {
 261         $self->add_seq($seq);
 262     }
 263   }
 264
 265   return $self; # success - we hope!
 266 }
 267
 268 =head1 Modifier methods
 269
 270 These methods modify the MSA by adding, removing or shuffling complete
 271 sequences.
 272
 273 =head2 add_seq
 274
 275  Title     : add_seq
 276  Usage     : $myalign->add_seq($newseq);
 277              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 278  Function  : Adds another sequence to the alignment. *Does not* align
 279              it - just adds it to the hashes.
 280              If -ORDER is specified, the sequence is inserted at the
 281              the position spec'd by -ORDER, and existing sequences
 282              are pushed down the storage array.
 283  Returns   : nothing
 284  Args      : A Bio::LocatableSeq object
 285              Positive integer for the sequence position (optional)
 286
 287 See L<Bio::LocatableSeq> for more information
 288
 289 =cut
 290
 291 sub addSeq {
 292     my $self = shift;
 293     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 294     $self->add_seq(@_);
 295 }
 296
 297 sub add_seq {
 298     my $self = shift;
 299     my @args = @_;
 300     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 301     my ($name,$id,$start,$end);
 302
 303     unless ($seq) {
 304         $self->throw("LocatableSeq argument required");
 305     }
 306     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 307         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 308     }
 309     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 310     $order--; # jay's patch (user-specified order is 1-origin)
 311
 312     if ($order < 0) {
 313         $self->throw("User-specified value for ORDER must be >= 1");
 314     }
 315
 316     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 317
 318     # build the symbol list for this sequence,
 319     # will prune out the gap and missing/match chars
 320     # when actually asked for the symbol list in the
 321     # symbol_chars
 322     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 323
 324     $name = $seq->get_nse;
 325
 326     if( $self->{'_seq'}->{$name} ) {
 327         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 328     }
 329     else {
 330         $self->debug( "Assigning $name to $order\n");
 331
 332     my $ordh = $self->{'_order'};
 333     if ($ordh->{$order}) {
 334         # make space to insert
 335         # $c->() returns (in reverse order) the first subsequence
 336         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 337         # (3,2,1), and $c->(2,4,5) returns (2).
 338         my $c;
 339         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 340         map {
 341      $ordh->{$_+1} = $ordh->{$_}
 342         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 343
 344     }
 345     $ordh->{$order} = $name;
 346
 347         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 348             $self->{'_start_end_lists'}->{$id} = [];
 349         }
 350         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 351     }
 352
 353     $self->{'_seq'}->{$name} = $seq;
 354
 355 }
 356
 357
 358 =head2 remove_seq
 359
 360  Title     : remove_seq
 361  Usage     : $aln->remove_seq($seq);
 362  Function  : Removes a single sequence from an alignment
 363  Returns   :
 364  Argument  : a Bio::LocatableSeq object
 365
 366 =cut
 367
 368 sub removeSeq {
 369     my $self = shift;
 370     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 371     $self->remove_seq(@_);
 372 }
 373
 374 sub remove_seq {
 375     my $self = shift;
 376     my $seq = shift;
 377     my ($name,$id,$start,$end);
 378
 379     $self->throw("Need Bio::Locatable seq argument ")
 380         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 381
 382     $id = $seq->id();
 383     $start = $seq->start();
 384     $end  = $seq->end();
 385     $name = sprintf("%s/%d-%d",$id,$start,$end);
 386
 387     if( !exists $self->{'_seq'}->{$name} ) {
 388         $self->throw("Sequence $name does not exist in the alignment to remove!");
 389     }
 390
 391     delete $self->{'_seq'}->{$name};
 392
 393     # we need to remove this seq from the start_end_lists hash
 394
 395     if (exists $self->{'_start_end_lists'}->{$id}) {
 396         # we need to find the sequence in the array.
 397
 398         my ($i, $found);;
 399         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 400             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 401                 $found = 1;
 402                 last;
 403             }
 404         }
 405         if ($found) {
 406             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 407         }
 408         else {
 409             $self->throw("Could not find the sequence to remoce from the start-end list");
 410         }
 411     }
 412     else {
 413         $self->throw("There is no seq list for the name $id");
 414     }
 415     # we need to shift order hash
 416     my %rev_order = reverse %{$self->{'_order'}};
 417     my $no = $rev_order{$name};
 418     my $num_sequences = $self->num_sequences;
 419     for (; $no < $num_sequences; $no++) {
 420        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 421     }
 422     delete $self->{'_order'}->{$no};
 423     return 1;
 424 }
 425
 426
 427 =head2 purge
 428
 429  Title   : purge
 430  Usage   : $aln->purge(0.7);
 431  Function: Removes sequences above given sequence similarity
 432            This function will grind on large alignments. Beware!
 433  Example :
 434  Returns : An array of the removed sequences
 435  Args    : float, threshold for similarity
 436
 437 =cut
 438
 439 sub purge {
 440         my ($self,$perc) = @_;
 441         my (%duplicate, @dups);
 442
 443         my @seqs = $self->each_seq();
 444
 445         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 446                 my $seq = $seqs[$i];
 447
 448                 #skip if already in duplicate hash
 449                 next if exists $duplicate{$seq->display_id} ;
 450                 my $one = $seq->seq();
 451
 452                 my @one = split '', $one;       #split to get 1aa per array element
 453
 454                 for (my $j=$i+1;$j < @seqs;$j++) {
 455                         my $seq2 = $seqs[$j];
 456
 457                         #skip if already in duplicate hash
 458                         next if exists $duplicate{$seq2->display_id} ;
 459
 460                         my $two = $seq2->seq();
 461                         my @two = split '', $two;
 462
 463                         my $count = 0;
 464                         my $res = 0;
 465                         for (my $k=0;$k<@one;$k++) {
 466                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 467                                           $one[$k] eq $two[$k]) {
 468                                         $count++;
 469                                 }
 470                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 471                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 472                                         $res++;
 473                                 }
 474                         }
 475
 476                         my $ratio = 0;
 477                         $ratio = $count/$res unless $res == 0;
 478
 479                         # if above threshold put in duplicate hash and push onto
 480                         # duplicate array for returning to get_unique
 481                         if ( $ratio > $perc ) {
 482                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 483                                 $duplicate{$seq2->display_id} = 1;
 484                                 push @dups, $seq2;
 485                         }
 486                 }
 487         }
 488         foreach my $seq (@dups) {
 489                 $self->remove_seq($seq);
 490         }
 491         return @dups;
 492 }
 493
 494 =head2 sort_alphabetically
 495
 496  Title     : sort_alphabetically
 497  Usage     : $ali->sort_alphabetically
 498  Function  : Changes the order of the alignment to alphabetical on name
 499              followed by numerical by number.
 500  Returns   :
 501  Argument  :
 502
 503 =cut
 504
 505 sub sort_alphabetically {
 506     my $self = shift;
 507     my ($seq,$nse,@arr,%hash,$count);
 508
 509     foreach $seq ( $self->each_seq() ) {
 510         $nse = $seq->get_nse;
 511         $hash{$nse} = $seq;
 512     }
 513
 514     $count = 0;
 515
 516     %{$self->{'_order'}} = (); # reset the hash;
 517
 518     foreach $nse ( sort _alpha_startend keys %hash) {
 519         $self->{'_order'}->{$count} = $nse;
 520
 521         $count++;
 522     }
 523     1;
 524 }
 525
 526 =head2 sort_by_list
 527
 528  Title     : sort_by_list
 529  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 530  Function  : Arbitrarily order sequences in an alignment
 531  Returns   : A new Bio::SimpleAlign object
 532  Argument  : a file listing sequence names in intended order (one name per line)
 533
 534 =cut
 535
 536 sub sort_by_list {
 537     my ($self, $list) = @_;
 538     my (@seq, @ids, %order);
 539
 540     foreach my $seq ( $self->each_seq() ) {
 541         push @seq, $seq;
 542         push @ids, $seq->display_id;
 543     }
 544
 545     my $ct=1;
 546     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 547     while (<$listfh>) {
 548       chomp;
 549       my $name=$_;
 550       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 551       $order{$name}=$ct++;
 552     }
 553     close($listfh);
 554
 555     # use the map-sort-map idiom:
 556     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 557     my $aln = $self->new;
 558     foreach (@sorted) { $aln->add_seq($_) }
 559     return $aln;
 560 }
 561
 562 =head2 set_new_reference
 563
 564  Title     : set_new_reference
 565  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 566              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 567              as the reference (1st) sequence
 568  Function  : Change/Set a new reference (i.e., the first) sequence
 569  Returns   : a new Bio::SimpleAlign object.
 570              Throws an exception if designated sequence not found
 571  Argument  : a positive integer of sequence order, or a sequence name
 572              in the original alignment
 573
 574 =cut
 575
 576 sub set_new_reference {
 577     my ($self, $seqid) = @_;
 578     my $aln = $self->new;
 579     my (@seq, @ids, @new_seq);
 580     my $is_num=0;
 581     foreach my $seq ( $self->each_seq() ) {
 582         push @seq, $seq;
 583         push @ids, $seq->display_id;
 584     }
 585
 586     if ($seqid =~ /^\d+$/) { # argument is seq position
 587         $is_num=1;
 588         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 589     } else { # argument is a seq name
 590         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 591     }
 592
 593     for (my $i=0; $i<=$#seq; $i++) {
 594         my $pos=$i+1;
 595         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 596             unshift @new_seq, $seq[$i];
 597         } else {
 598             push @new_seq, $seq[$i];
 599         }
 600     }
 601     foreach (@new_seq) { $aln->add_seq($_);  }
 602     return $aln;
 603 }
 604
 605 sub _in_aln {  # check if input name exists in the alignment
 606     my ($str, $ref) = @_;
 607     foreach (@$ref) {
 608         return 1 if $str eq $_;
 609     }
 610     return 0;
 611 }
 612
 613
 614 =head2 uniq_seq
 615
 616  Title     : uniq_seq
 617  Usage     : $aln->uniq_seq():  Remove identical sequences in
 618              in the alignment.  Ambiguous base ("N", "n") and
 619              leading and ending gaps ("-") are NOT counted as
 620              differences.
 621  Function  : Make a new alignment of unique sequence types (STs)
 622  Returns   : 1a. if called in a scalar context,
 623                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 624              1b. if called in an array context,
 625                 a new Bio::SimpleAlign object, and a hashref whose keys
 626                 are sequence types, and whose values are arrayrefs to
 627                 lists of sequence ids within the corresponding sequence type
 628              2. if $aln->verbose > 0, ST of each sequence is sent to
 629                 STDERR (in a tabular format)
 630  Argument  : None
 631
 632 =cut
 633
 634 sub uniq_seq {
 635     my ($self, $seqid) = @_;
 636     my $aln = $self->new;
 637     my (%member, %order, @seq, @uniq_str, $st);
 638     my $order=0;
 639     my $len = $self->length();
 640     $st = {};
 641     foreach my $seq ( $self->each_seq() ) {
 642         my $str = $seq->seq();
 643
 644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 645 # comparing two sequence strings
 646
 647     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 648         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 649     # 2nd, convert leading and ending gaps to "?":
 650         $str = &_convert_leading_ending_gaps($str, '-', '?');
 651     # Note that '?' also can mean unknown residue.
 652     # I don't like making global class member changes like this, too
 653     # prone to errors... -- cjfields 08-11-18
 654     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 655         my $new = Bio::LocatableSeq->new(
 656                      -id      => $seq->id(),
 657                                          -alphabet=> $seq->alphabet,
 658                                          -seq     => $str,
 659                                          -start   => $seq->start,
 660                                          -end     => $seq->end
 661                                          );
 662         push @seq, $new;
 663     }
 664
 665     foreach my $seq (@seq) {
 666         my $str = $seq->seq();
 667         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 668         if ($seen) { # seen before
 669             my @memb = @{$member{$key}};
 670             push @memb, $seq;
 671             $member{$key} = \@memb;
 672         } else {  # not seen
 673             push @uniq_str, $key;
 674             $order++;
 675             $member{$key} = [ ($seq) ];
 676             $order{$key} = $order;
 677         }
 678     }
 679
 680     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 682         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 684         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 685         my $gap='-';
 686         my $end=length($str2);
 687         $end -= length($1) while $str2 =~ m/($gap+)/g;
 688         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 689                                          -seq  =>$str2,
 690                                          -start=>1,
 691                                          -end  =>$end
 692                                          );
 693         $aln->add_seq($new);
 694         foreach (@{$member{$str}}) {
 695             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 696         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 697         }
 698     }
 699     return wantarray ? ($aln, $st) : $aln;
 700 }
 701
 702 sub _check_uniq {  # check if same seq exists in the alignment
 703     my ($str1, $ref, $length) = @_;
 704     my @char1=split //, $str1;
 705     my @array=@$ref;
 706
 707     return (0, $str1) if @array==0; # not seen (1st sequence)
 708
 709     foreach my $str2 (@array) {
 710         my $diff=0;
 711         my @char2=split //, $str2;
 712         for (my $i=0; $i<=$length-1; $i++) {
 713             next if $char1[$i] eq '?';
 714             next if $char2[$i] eq '?';
 715             $diff++ if $char1[$i] ne $char2[$i];
 716         }
 717         return (1, $str2) if $diff == 0;  # seen before
 718     }
 719
 720     return (0, $str1); # not seen
 721 }
 722
 723 sub _convert_leading_ending_gaps {
 724     my $s=shift;
 725     my $sym1=shift;
 726     my $sym2=shift;
 727     my @array=split //, $s;
 728 # convert leading char:
 729     for (my $i=0; $i<=$#array; $i++) {
 730         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 731     }
 732 # convert ending char:
 733     for (my $i = $#array; $i>= 0; $i--) {
 734         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 735     }
 736     my $s_new=join '', @array;
 737     return $s_new;
 738 }
 739
 740 =head1 Sequence selection methods
 741
 742 Methods returning one or more sequences objects.
 743
 744 =head2 each_seq
 745
 746  Title     : each_seq
 747  Usage     : foreach $seq ( $align->each_seq() )
 748  Function  : Gets a Seq object from the alignment
 749  Returns   : Seq object
 750  Argument  :
 751
 752 =cut
 753
 754 sub eachSeq {
 755     my $self = shift;
 756     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 757     $self->each_seq();
 758 }
 759
 760 sub each_seq {
 761         my $self = shift;
 762         my (@arr,$order);
 763
 764         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 765                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 766                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 767                 }
 768         }
 769         return @arr;
 770 }
 771
 772
 773 =head2 each_alphabetically
 774
 775  Title     : each_alphabetically
 776  Usage     : foreach $seq ( $ali->each_alphabetically() )
 777  Function  : Returns a sequence object, but the objects are returned
 778              in alphabetically sorted order.
 779              Does not change the order of the alignment.
 780  Returns   : Seq object
 781  Argument  :
 782
 783 =cut
 784
 785 sub each_alphabetically {
 786         my $self = shift;
 787         my ($seq,$nse,@arr,%hash,$count);
 788
 789         foreach $seq ( $self->each_seq() ) {
 790                 $nse = $seq->get_nse;
 791                 $hash{$nse} = $seq;
 792         }
 793
 794         foreach $nse ( sort _alpha_startend keys %hash) {
 795                 push(@arr,$hash{$nse});
 796         }
 797         return @arr;
 798 }
 799
 800 sub _alpha_startend {
 801     my ($aname,$astart,$bname,$bstart);
 802     ($aname,$astart) = split (/-/,$a);
 803     ($bname,$bstart) = split (/-/,$b);
 804
 805     if( $aname eq $bname ) {
 806         return $astart <=> $bstart;
 807     }
 808     else {
 809         return $aname cmp $bname;
 810     }
 811 }
 812
 813 =head2 each_seq_with_id
 814
 815  Title     : each_seq_with_id
 816  Usage     : foreach $seq ( $align->each_seq_with_id() )
 817  Function  : Gets a Seq objects from the alignment, the contents
 818              being those sequences with the given name (there may be
 819              more than one)
 820  Returns   : Seq object
 821  Argument  : a seq name
 822
 823 =cut
 824
 825 sub eachSeqWithId {
 826     my $self = shift;
 827     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 828     $self->each_seq_with_id(@_);
 829 }
 830
 831 sub each_seq_with_id {
 832     my $self = shift;
 833     my $id = shift;
 834
 835     $self->throw("Method each_seq_with_id needs a sequence name argument")
 836         unless defined $id;
 837
 838     my (@arr, $seq);
 839
 840     if (exists($self->{'_start_end_lists'}->{$id})) {
 841         @arr = @{$self->{'_start_end_lists'}->{$id}};
 842     }
 843     return @arr;
 844 }
 845
 846 =head2 get_seq_by_pos
 847
 848  Title     : get_seq_by_pos
 849  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 850  Function  : Gets a sequence based on its position in the alignment.
 851              Numbering starts from 1.  Sequence positions larger than
 852              num_sequences() will thow an error.
 853  Returns   : a Bio::LocatableSeq object
 854  Args      : positive integer for the sequence position
 855
 856 =cut
 857
 858 sub get_seq_by_pos {
 859
 860     my $self = shift;
 861     my ($pos) = @_;
 862
 863     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 864         unless $pos =~ /^\d+$/ and $pos > 0;
 865     $self->throw("No sequence at position [$pos]")
 866         unless $pos <= $self->num_sequences ;
 867
 868     my $nse = $self->{'_order'}->{--$pos};
 869     return $self->{'_seq'}->{$nse};
 870 }
 871
 872 =head2 get_seq_by_id
 873
 874  Title     : get_seq_by_id
 875  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 876  Function  : Gets a sequence based on its name.
 877              Sequences that do not exist will warn and return undef
 878  Returns   : a Bio::LocatableSeq object
 879  Args      : string for sequence name
 880
 881 =cut
 882
 883 sub get_seq_by_id {
 884     my ($self,$name) = @_;
 885     unless( defined $name ) {
 886       $self->warn("Must provide a sequence name");
 887       return;
 888     }
 889     for my $seq ( values %{$self->{'_seq'}} ) {
 890       if ( $seq->id eq $name) {
 891         return $seq;
 892       }
 893     }
 894     return;
 895 }
 896
 897 =head2 seq_with_features
 898
 899  Title   : seq_with_features
 900  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 901                                           -consensus => 60
 902                                           -mask =>
 903            sub { my $consensus = shift;
 904
 905                  for my $i (1..5){
 906                     my $n = 'N' x $i;
 907                     my $q = '\?' x $i;
 908                     while($consensus =~ /[^?]$q[^?]/){
 909                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 910                     }
 911                   }
 912                  return $consensus;
 913                }
 914                                          );
 915  Function: produces a Bio::Seq object by first splicing gaps from -pos
 916            (by means of a splice_by_seq_pos() call), then creating
 917            features using non-? chars (by means of a consensus_string()
 918            call with stringency -consensus).
 919  Returns : a Bio::Seq object
 920  Args    : -pos : required. sequence from which to build the Bio::Seq
 921              object
 922            -consensus : optional, defaults to consensus_string()'s
 923              default cutoff value
 924            -mask : optional, a coderef to apply to consensus_string()'s
 925              output before building features.  this may be useful for
 926              closing gaps of 1 bp by masking over them with N, for
 927              instance
 928
 929 =cut
 930
 931 sub seq_with_features{
 932    my ($self,%arg) = @_;
 933
 934    #first do the preparatory splice
 935    $self->throw("must provide a -pos argument") unless $arg{-pos};
 936    $self->splice_by_seq_pos($arg{-pos});
 937
 938    my $consensus_string = $self->consensus_string($arg{-consensus});
 939    $consensus_string = $arg{-mask}->($consensus_string)
 940          if defined($arg{-mask});
 941
 942    my(@bs,@es);
 943
 944    push @bs, 1 if $consensus_string =~ /^[^?]/;
 945
 946    while($consensus_string =~ /\?[^?]/g){
 947          push @bs, pos($consensus_string);
 948    }
 949    while($consensus_string =~ /[^?]\?/g){
 950          push @es, pos($consensus_string);
 951    }
 952
 953    push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
 954
 955    my $seq = Bio::Seq->new();
 956
 957 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 958 #                -source_tag => 'location',
 959 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 960 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 961 #                                                  );
 962 #   $seq->add_SeqFeature($rootfeature);
 963
 964    while(my $b = shift @bs){
 965          my $e = shift @es;
 966          $seq->add_SeqFeature(
 967        Bio::SeqFeature::Generic->new(
 968          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 969          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 970          -source_tag => $self->source || 'MSA',
 971        )
 972      );
 973    }
 974
 975    return $seq;
 976 }
 977
 978
 979 =head1 Create new alignments
 980
 981 The result of these methods are horizontal or vertical subsets of the
 982 current MSA.
 983
 984 =head2 select
 985
 986  Title     : select
 987  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 988  Function  : Creates a new alignment from a continuous subset of
 989              sequences.  Numbering starts from 1.  Sequence positions
 990              larger than num_sequences() will thow an error.
 991  Returns   : a Bio::SimpleAlign object
 992  Args      : positive integer for the first sequence
 993              positive integer for the last sequence to include (optional)
 994
 995 =cut
 996
 997 sub select {
 998     my $self = shift;
 999     my ($start, $end) = @_;
1000
1001     $self->throw("Select start has to be a positive integer, not [$start]")
1002         unless $start =~ /^\d+$/ and $start > 0;
1003     $self->throw("Select end has to be a positive integer, not [$end]")
1004         unless $end  =~ /^\d+$/ and $end > 0;
1005     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006         unless $start <= $end;
1007
1008     my $aln = $self->new;
1009     foreach my $pos ($start .. $end) {
1010         $aln->add_seq($self->get_seq_by_pos($pos));
1011     }
1012     $aln->id($self->id);
1013     # fix for meta, sf, ann
1014     return $aln;
1015 }
1016
1017 =head2 select_noncont
1018
1019  Title     : select_noncont
1020  Usage     : # 1st and 3rd sequences, sorted
1021              $aln2 = $aln->select_noncont(1, 3)
1022
1023              # 1st and 3rd sequences, sorted (same as first)
1024              $aln2 = $aln->select_noncont(3, 1)
1025
1026              # 1st and 3rd sequences, unsorted
1027              $aln2 = $aln->select_noncont('nosort',3, 1)
1028
1029  Function  : Creates a new alignment from a subset of sequences.  Numbering
1030              starts from 1.  Sequence positions larger than num_sequences() will
1031              throw an error.  Sorts the order added to new alignment by default,
1032              to prevent sorting pass 'nosort' as the first argument in the list.
1033  Returns   : a Bio::SimpleAlign object
1034  Args      : array of integers for the sequences.  If the string 'nosort' is
1035              passed as the first argument, the sequences will not be sorted
1036              in the new alignment but will appear in the order listed.
1037
1038 =cut
1039
1040 sub select_noncont {
1041         my $self = shift;
1042     my $nosort = 0;
1043         my (@pos) = @_;
1044     if ($pos[0] !~ m{^\d+$}) {
1045         my $sortcmd = shift @pos;
1046         if ($sortcmd eq 'nosort') {
1047             $nosort = 1;
1048         } else {
1049             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1050         }
1051     }
1052
1053     my $end = $self->num_sequences;
1054     foreach ( @pos ) {
1055                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1057         }
1058
1059         @pos = sort {$a <=> $b} @pos unless $nosort;
1060
1061         my $aln = $self->new;
1062         foreach my $p (@pos) {
1063                 $aln->add_seq($self->get_seq_by_pos($p));
1064         }
1065         $aln->id($self->id);
1066     # fix for meta, sf, ann
1067         return $aln;
1068 }
1069
1070 =head2 slice
1071
1072  Title     : slice
1073  Usage     : $aln2 = $aln->slice(20,30)
1074  Function  : Creates a slice from the alignment inclusive of start and
1075              end columns, and the first column in the alignment is denoted 1.
1076              Sequences with no residues in the slice are excluded from the
1077              new alignment and a warning is printed. Slice beyond the length of
1078              the sequence does not do padding.
1079  Returns   : A Bio::SimpleAlign object
1080  Args      : Positive integer for start column, positive integer for end column,
1081              optional boolean which if true will keep gap-only columns in the newly
1082              created slice. Example:
1083
1084              $aln2 = $aln->slice(20,30,1)
1085
1086 =cut
1087
1088 sub slice {
1089         my $self = shift;
1090         my ($start, $end, $keep_gap_only) = @_;
1091
1092         $self->throw("Slice start has to be a positive integer, not [$start]")
1093           unless $start =~ /^\d+$/ and $start > 0;
1094         $self->throw("Slice end has to be a positive integer, not [$end]")
1095           unless $end =~ /^\d+$/ and $end > 0;
1096         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097           unless $start <= $end;
1098         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099                                          "[$start] is too big.") if $start > $self->length;
1100     my $cons_meta = $self->consensus_meta;
1101         my $aln = $self->new;
1102         $aln->id($self->id);
1103         foreach my $seq ( $self->each_seq() ) {
1104             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105             Bio::Seq::Meta->new
1106         (-id      => $seq->id,
1107                  -alphabet => $seq->alphabet,
1108                  -strand  => $seq->strand,
1109                  -verbose => $self->verbose) :
1110             Bio::LocatableSeq->new
1111         (-id      => $seq->id,
1112                  -alphabet => $seq->alphabet,
1113                  -strand  => $seq->strand,
1114                  -verbose => $self->verbose);
1115
1116             # seq
1117             my $seq_end = $end;
1118             $seq_end = $seq->length if( $end > $seq->length );
1119
1120             my $slice_seq = $seq->subseq($start, $seq_end);
1121             $new_seq->seq( $slice_seq );
1122
1123             $slice_seq =~ s/\W//g;
1124
1125             if ($start > 1) {
1126             my $pre_start_seq = $seq->subseq(1, $start - 1);
1127             $pre_start_seq =~ s/\W//g;
1128             if (!defined($seq->strand)) {
1129                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130             } elsif ($seq->strand < 0){
1131                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132             } else {
1133                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1134             }
1135             } else {
1136             $new_seq->start( $seq->start);
1137             }
1138         if ($new_seq->isa('Bio::Seq::MetaI')) {
1139             for my $meta_name ($seq->meta_names) {
1140                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1141             }
1142         }
1143             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1144
1145             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1146             $aln->add_seq($new_seq);
1147             } else {
1148             if( $keep_gap_only ) {
1149                 $aln->add_seq($new_seq);
1150             } else {
1151                 my $nse = $seq->get_nse();
1152                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1153                     " Sequence excluded from the new alignment.");
1154             }
1155             }
1156         }
1157     if ($cons_meta) {
1158         my $new = Bio::Seq::Meta->new();
1159         for my $meta_name ($cons_meta->meta_names) {
1160             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1161         }
1162         $aln->consensus_meta($new);
1163     }
1164     $aln->annotation($self->annotation);
1165     # fix for meta, sf, ann
1166         return $aln;
1167 }
1168
1169 =head2 remove_columns
1170
1171  Title     : remove_columns
1172  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1173              $aln2 = $aln->remove_columns([0,0],[6,8])
1174  Function  : Creates an aligment with columns removed corresponding to
1175              the specified type or by specifying the columns by number.
1176  Returns   : Bio::SimpleAlign object
1177  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1178              'all_gaps_columns') or array ref where the referenced array
1179              contains a pair of integers that specify a range.
1180              The first column is 0
1181
1182 =cut
1183
1184 sub remove_columns {
1185     my ($self,@args) = @_;
1186     @args || $self->throw("Must supply column ranges or column types");
1187     my $aln;
1188
1189     if ($args[0][0] =~ /^[a-z_]+$/i) {
1190         $aln = $self->_remove_columns_by_type($args[0]);
1191     } elsif ($args[0][0] =~ /^\d+$/) {
1192         $aln = $self->_remove_columns_by_num(\@args);
1193     } else {
1194         $self->throw("You must pass array references to remove_columns(), not @args");
1195     }
1196     # fix for meta, sf, ann
1197     $aln;
1198 }
1199
1200
1201 =head2 remove_gaps
1202
1203  Title     : remove_gaps
1204  Usage     : $aln2 = $aln->remove_gaps
1205  Function  : Creates an aligment with gaps removed
1206  Returns   : a Bio::SimpleAlign object
1207  Args      : a gap character(optional) if none specified taken
1208                 from $self->gap_char,
1209              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1210                         indicates that only all-gaps columns should be deleted
1211
1212 Used from method L<remove_columns> in most cases. Set gap character
1213 using L<gap_char()|gap_char>.
1214
1215 =cut
1216
1217 sub remove_gaps {
1218     my ($self,$gapchar,$all_gaps_columns) = @_;
1219     my $gap_line;
1220     if ($all_gaps_columns) {
1221         $gap_line = $self->all_gap_line($gapchar);
1222     } else {
1223         $gap_line = $self->gap_line($gapchar);
1224     }
1225     my $aln = $self->new;
1226
1227     my @remove;
1228     my $length = 0;
1229     my $del_char = $gapchar || $self->gap_char;
1230     # Do the matching to get the segments to remove
1231     while ($gap_line =~ m/[$del_char]/g) {
1232         my $start = pos($gap_line)-1;
1233         $gap_line=~/\G[$del_char]+/gc;
1234         my $end = pos($gap_line)-1;
1235
1236         #have to offset the start and end for subsequent removes
1237         $start-=$length;
1238         $end  -=$length;
1239         $length += ($end-$start+1);
1240         push @remove, [$start,$end];
1241     }
1242
1243     #remove the segments
1244     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1245     # fix for meta, sf, ann
1246     return $aln;
1247 }
1248
1249
1250 sub _remove_col {
1251     my ($self,$aln,$remove) = @_;
1252     my @new;
1253
1254     my $gap = $self->gap_char;
1255
1256     # splice out the segments and create new seq
1257     foreach my $seq($self->each_seq){
1258         my $new_seq = Bio::LocatableSeq->new(
1259                                              -id      => $seq->id,
1260                                              -alphabet=> $seq->alphabet,
1261                                              -strand  => $seq->strand,
1262                                              -verbose => $self->verbose);
1263         my $sequence = $seq->seq;
1264         foreach my $pair(@{$remove}){
1265             my $start = $pair->[0];
1266             my $end   = $pair->[1];
1267             $sequence = $seq->seq unless $sequence;
1268             my $orig = $sequence;
1269             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1270             my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1271             $sequence = $head.$tail;
1272             # start
1273             unless (defined $new_seq->start) {
1274                 if ($start == 0) {
1275                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1276                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1277                 }
1278                 else {
1279                     my $start_adjust = $orig =~ /^$gap+/;
1280                     if ($start_adjust) {
1281                         $start_adjust = $+[0] == $start;
1282                     }
1283                     $new_seq->start($seq->start + $start_adjust);
1284                 }
1285             }
1286             # end
1287             if (($end + 1) >= length($orig)) {
1288                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1289                 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1290             }
1291             else {
1292                 $new_seq->end($seq->end);
1293             }
1294         }
1295
1296         if ($new_seq->end < $new_seq->start) {
1297             # we removed all columns except for gaps: set to 0 to indicate no
1298             # sequence
1299             $new_seq->start(0);
1300             $new_seq->end(0);
1301         }
1302
1303         $new_seq->seq($sequence) if $sequence;
1304                 push @new, $new_seq;
1305     }
1306     # add the new seqs to the alignment
1307     foreach my $new(@new){
1308         $aln->add_seq($new);
1309     }
1310     # fix for meta, sf, ann
1311     return $aln;
1312 }
1313
1314 sub _remove_columns_by_type {
1315         my ($self,$type) = @_;
1316         my $aln = $self->new;
1317         my @remove;
1318
1319         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1320         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1321         my %matchchars = ( 'match'           => '\*',
1322                        'weak'             => '\.',
1323                        'strong'           => ':',
1324                        'mismatch'         => ' ',
1325                        'gaps'             => '',
1326                        'all_gaps_columns' => ''
1327                      );
1328         # get the characters to delete against
1329         my $del_char;
1330         foreach my $type (@{$type}){
1331                 $del_char.= $matchchars{$type};
1332         }
1333
1334         my $length = 0;
1335         my $match_line = $self->match_line;
1336         # do the matching to get the segments to remove
1337         if($del_char){
1338                 while($match_line =~ m/[$del_char]/g ){
1339                         my $start = pos($match_line)-1;
1340                         $match_line=~/\G[$del_char]+/gc;
1341                         my $end = pos($match_line)-1;
1342
1343                         #have to offset the start and end for subsequent removes
1344                         $start-=$length;
1345                         $end  -=$length;
1346                         $length += ($end-$start+1);
1347                         push @remove, [$start,$end];
1348                 }
1349         }
1350
1351         # remove the segments
1352         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1353         $aln = $aln->remove_gaps() if $gap;
1354         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1355     # fix for meta, sf, ann
1356         $aln;
1357 }
1358
1359
1360 sub _remove_columns_by_num {
1361         my ($self,$positions) = @_;
1362         my $aln = $self->new;
1363
1364         # sort the positions
1365         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1366
1367     my @remove;
1368     my $length = 0;
1369     foreach my $pos (@{$positions}) {
1370         my ($start, $end) = @{$pos};
1371
1372         #have to offset the start and end for subsequent removes
1373         $start-=$length;
1374         $end  -=$length;
1375         $length += ($end-$start+1);
1376         push @remove, [$start,$end];
1377     }
1378
1379     #remove the segments
1380     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1381     # fix for meta, sf, ann
1382         $aln;
1383 }
1384
1385
1386 =head1 Change sequences within the MSA
1387
1388 These methods affect characters in all sequences without changing the
1389 alignment.
1390
1391 =head2 splice_by_seq_pos
1392
1393  Title   : splice_by_seq_pos
1394  Usage   : $status = splice_by_seq_pos(1);
1395  Function: splices all aligned sequences where the specified sequence
1396            has gaps.
1397  Example :
1398  Returns : 1 on success
1399  Args    : position of sequence to splice by
1400
1401
1402 =cut
1403
1404 sub splice_by_seq_pos{
1405   my ($self,$pos) = @_;
1406
1407   my $guide = $self->get_seq_by_pos($pos);
1408   my $guide_seq = $guide->seq;
1409
1410   $guide_seq =~ s/\./\-/g;
1411
1412   my @gaps = ();
1413   $pos = -1;
1414   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1415     unshift @gaps, $pos;
1416     $pos++;
1417   }
1418
1419   foreach my $seq ($self->each_seq){
1420     my @bases = split '', $seq->seq;
1421
1422     splice(@bases, $_, 1) foreach @gaps;
1423     $seq->seq(join('', @bases));
1424   }
1425
1426   1;
1427 }
1428
1429 =head2 map_chars
1430
1431  Title     : map_chars
1432  Usage     : $ali->map_chars('\.','-')
1433  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1434              characters
1435
1436              Notice that the from (arg1) is interpretted as a regex,
1437              so be careful about quoting meta characters (eg
1438              $ali->map_chars('.','-') wont do what you want)
1439  Returns   :
1440  Argument  : 'from' rexexp
1441              'to' string
1442
1443 =cut
1444
1445 sub map_chars {
1446     my $self = shift;
1447     my $from = shift;
1448     my $to   = shift;
1449     my ($seq,$temp);
1450
1451     $self->throw("Need exactly two arguments")
1452         unless defined $from and defined $to;
1453
1454     foreach $seq ( $self->each_seq() ) {
1455         $temp = $seq->seq();
1456         $temp =~ s/$from/$to/g;
1457         $seq->seq($temp);
1458     }
1459     return 1;
1460 }
1461
1462
1463 =head2 uppercase
1464
1465  Title     : uppercase()
1466  Usage     : $ali->uppercase()
1467  Function  : Sets all the sequences to uppercase
1468  Returns   :
1469  Argument  :
1470
1471 =cut
1472
1473 sub uppercase {
1474     my $self = shift;
1475     my $seq;
1476     my $temp;
1477
1478     foreach $seq ( $self->each_seq() ) {
1479       $temp = $seq->seq();
1480       $temp =~ tr/[a-z]/[A-Z]/;
1481
1482       $seq->seq($temp);
1483     }
1484     return 1;
1485 }
1486
1487 =head2 cigar_line
1488
1489  Title    : cigar_line()
1490  Usage    : %cigars = $align->cigar_line()
1491  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1492             Report) line for each sequence in the alignment. Examples are
1493             "1,60" or "5,10:12,58", where the numbers refer to conserved
1494             positions within the alignment. The keys of the hash are the
1495             NSEs (name/start/end) assigned to each sequence.
1496  Args     : threshold (optional, defaults to 100)
1497  Returns  : Hash of strings (cigar lines)
1498
1499 =cut
1500
1501 sub cigar_line {
1502         my $self = shift;
1503         my $thr=shift||100;
1504         my %cigars;
1505
1506         my @consensus = split "",($self->consensus_string($thr));
1507         my $len = $self->length;
1508         my $gapchar = $self->gap_char;
1509
1510         # create a precursor, something like (1,4,5,6,7,33,45),
1511         # where each number corresponds to a conserved position
1512         foreach my $seq ( $self->each_seq ) {
1513                 my @seq = split "", uc ($seq->seq);
1514                 my $pos = 1;
1515                 for (my $x = 0 ; $x < $len ; $x++ ) {
1516                         if ($seq[$x] eq $consensus[$x]) {
1517                                 push @{$cigars{$seq->get_nse}},$pos;
1518                                 $pos++;
1519                         } elsif ($seq[$x] ne $gapchar) {
1520                                 $pos++;
1521                         }
1522                 }
1523         }
1524         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1525         for my $name (keys %cigars) {
1526                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1527                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1528       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1529            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1530                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1531                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1532                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1533                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1534                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1535                         }
1536       }
1537         }
1538   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1539   for my $name (keys %cigars) {
1540           my @remove;
1541           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1542                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1543                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1544                       unshift @remove,$x;
1545               }
1546            }
1547       for my $pos (@remove) {
1548                         splice @{$cigars{$name}}, $pos, 1;
1549            }
1550    }
1551    # join and punctuate
1552    for my $name (keys %cigars) {
1553           my ($start,$end,$str) = "";
1554           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1555                   $str .= ($start . "," . $end . ":");
1556           }
1557           $str =~ s/:$//;
1558       $cigars{$name} = $str;
1559    }
1560    %cigars;
1561 }
1562
1563
1564 =head2 match_line
1565
1566  Title    : match_line()
1567  Usage    : $line = $align->match_line()
1568  Function : Generates a match line - much like consensus string
1569             except that a line indicating the '*' for a match.
1570  Args     : (optional) Match line characters ('*' by default)
1571             (optional) Strong match char (':' by default)
1572             (optional) Weak match char ('.' by default)
1573  Returns  : String
1574
1575 =cut
1576
1577 sub match_line {
1578         my ($self,$matchlinechar, $strong, $weak) = @_;
1579         my %matchchars = ('match'    => $matchlinechar || '*',
1580                                                           'weak'     => $weak          || '.',
1581                                                           'strong'   => $strong        || ':',
1582                                                           'mismatch' => ' ',
1583                                                   );
1584
1585         my @seqchars;
1586         my $alphabet;
1587         foreach my $seq ( $self->each_seq ) {
1588                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1589                 $alphabet = $seq->alphabet unless defined $alphabet;
1590         }
1591         my $refseq = shift @seqchars;
1592         # let's just march down the columns
1593         my $matchline;
1594  POS:
1595         foreach my $pos ( 0..$self->length ) {
1596                 my $refchar = $refseq->[$pos];
1597                 my $char = $matchchars{'mismatch'};
1598                 unless( defined $refchar ) {
1599                         last if $pos == $self->length; # short circuit on last residue
1600                         # this in place to handle jason's soon-to-be-committed
1601                         # intron mapping code
1602                         goto bottom;
1603                 }
1604                 my %col = ($refchar => 1);
1605                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1606                 foreach my $seq ( @seqchars ) {
1607                         next if $pos >= scalar @$seq;
1608                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1609                                                           $seq->[$pos] eq ' ' );
1610                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1611                 }
1612                 my @colresidues = sort keys %col;
1613
1614                 # if all the values are the same
1615                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1616                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1617                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1618                         # matches for protein seqs
1619             TYPE:
1620                         foreach my $type ( qw(strong weak) ) {
1621                                 # iterate through categories
1622                                 my %groups;
1623                                 # iterate through each of the aa in the col
1624                                 # look to see which groups it is in
1625                                 foreach my $c ( @colresidues ) {
1626                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1627                                                 push @{$groups{$f}},$c;
1628                                         }
1629                                 }
1630                          GRP:
1631                                 foreach my $cols ( values %groups ) {
1632                                         @$cols = sort @$cols;
1633                                         # now we are just testing to see if two arrays
1634                                         # are identical w/o changing either one
1635                                         # have to be same len
1636                                         next if( scalar @$cols != scalar @colresidues );
1637                                         # walk down the length and check each slot
1638                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1639                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1640                                         }
1641                                         $char = $matchchars{$type};
1642                                         last TYPE;
1643                                 }
1644                         }
1645                 }
1646          bottom:
1647                 $matchline .= $char;
1648         }
1649         return $matchline;
1650 }
1651
1652
1653 =head2 gap_line
1654
1655  Title    : gap_line()
1656  Usage    : $line = $align->gap_line()
1657  Function : Generates a gap line - much like consensus string
1658             except that a line where '-' represents gap
1659  Args     : (optional) gap line characters ('-' by default)
1660  Returns  : string
1661
1662 =cut
1663
1664 sub gap_line {
1665     my ($self,$gapchar) = @_;
1666     $gapchar = $gapchar || $self->gap_char;
1667     my %gap_hsh; # column gaps vector
1668     foreach my $seq ( $self->each_seq ) {
1669                 my $i = 0;
1670         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1671                   map {[$i++, $_]} split(//, uc ($seq->seq));
1672     }
1673     my $gap_line;
1674     foreach my $pos ( 0..$self->length-1 ) {
1675           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1676     }
1677     return $gap_line;
1678 }
1679
1680 =head2 all_gap_line
1681
1682  Title    : all_gap_line()
1683  Usage    : $line = $align->all_gap_line()
1684  Function : Generates a gap line - much like consensus string
1685             except that a line where '-' represents all-gap column
1686  Args     : (optional) gap line characters ('-' by default)
1687  Returns  : string
1688
1689 =cut
1690
1691 sub all_gap_line {
1692     my ($self,$gapchar) = @_;
1693     $gapchar = $gapchar || $self->gap_char;
1694     my %gap_hsh;                # column gaps counter hash
1695     my @seqs = $self->each_seq;
1696     foreach my $seq ( @seqs ) {
1697         my $i = 0;
1698         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1699         map {[$i++, $_]} split(//, uc ($seq->seq));
1700     }
1701     my $gap_line;
1702     foreach my $pos ( 0..$self->length-1 ) {
1703         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1704             # gaps column
1705             $gap_line .= $gapchar;
1706         } else {
1707             $gap_line .= '.';
1708         }
1709     }
1710     return $gap_line;
1711 }
1712
1713 =head2 gap_col_matrix
1714
1715  Title    : gap_col_matrix()
1716  Usage    : my $cols = $align->gap_col_matrix()
1717  Function : Generates an array of hashes where
1718             each entry in the array is a hash reference
1719             with keys of all the sequence names and
1720             and value of 1 or 0 if the sequence has a gap at that column
1721  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1722
1723 =cut
1724
1725 sub gap_col_matrix {
1726     my ($self,$gapchar) = @_;
1727     $gapchar = $gapchar || $self->gap_char;
1728     my %gap_hsh; # column gaps vector
1729     my @cols;
1730     foreach my $seq ( $self->each_seq ) {
1731         my $i = 0;
1732         my $str = $seq->seq;
1733         my $len = $seq->length;
1734         my $ch;
1735         my $id = $seq->display_id;
1736         while( $i < $len ) {
1737             $ch = substr($str, $i, 1);
1738             $cols[$i++]->{$id} = ($ch eq $gapchar);
1739         }
1740     }
1741     return \@cols;
1742 }
1743
1744 =head2 match
1745
1746  Title     : match()
1747  Usage     : $ali->match()
1748  Function  : Goes through all columns and changes residues that are
1749              identical to residue in first sequence to match '.'
1750              character. Sets match_char.
1751
1752              USE WITH CARE: Most MSA formats do not support match
1753              characters in sequences, so this is mostly for output
1754              only. NEXUS format (Bio::AlignIO::nexus) can handle
1755              it.
1756  Returns   : 1
1757  Argument  : a match character, optional, defaults to '.'
1758
1759 =cut
1760
1761 sub match {
1762     my ($self, $match) = @_;
1763
1764     $match ||= '.';
1765     my ($matching_char) = $match;
1766     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1767     $self->map_chars($matching_char, '-');
1768
1769     my @seqs = $self->each_seq();
1770     return 1 unless scalar @seqs > 1;
1771
1772     my $refseq = shift @seqs ;
1773     my @refseq = split //, $refseq->seq;
1774     my $gapchar = $self->gap_char;
1775
1776     foreach my $seq ( @seqs ) {
1777         my @varseq = split //, $seq->seq();
1778         for ( my $i=0; $i < scalar @varseq; $i++) {
1779             $varseq[$i] = $match if defined $refseq[$i] &&
1780                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1781                   $refseq[$i] =~ /$gapchar/ )
1782                       && $refseq[$i] eq $varseq[$i];
1783         }
1784         $seq->seq(join '', @varseq);
1785     }
1786     $self->match_char($match);
1787     return 1;
1788 }
1789
1790
1791 =head2 unmatch
1792
1793  Title     : unmatch()
1794  Usage     : $ali->unmatch()
1795  Function  : Undoes the effect of method match. Unsets match_char.
1796  Returns   : 1
1797  Argument  : a match character, optional, defaults to '.'
1798
1799 See L<match> and L<match_char>
1800
1801 =cut
1802
1803 sub unmatch {
1804     my ($self, $match) = @_;
1805
1806     $match ||= '.';
1807
1808     my @seqs = $self->each_seq();
1809     return 1 unless scalar @seqs > 1;
1810
1811     my $refseq = shift @seqs ;
1812     my @refseq = split //, $refseq->seq;
1813     my $gapchar = $self->gap_char;
1814     foreach my $seq ( @seqs ) {
1815         my @varseq = split //, $seq->seq();
1816         for ( my $i=0; $i < scalar @varseq; $i++) {
1817             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1818                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1819                   $refseq[$i] =~ /$gapchar/ ) &&
1820                       $varseq[$i] eq $match;
1821         }
1822         $seq->seq(join '', @varseq);
1823     }
1824     $self->match_char('');
1825     return 1;
1826 }
1827
1828 =head1 MSA attributes
1829
1830 Methods for setting and reading the MSA attributes.
1831
1832 Note that the methods defining character semantics depend on the user
1833 to set them sensibly.  They are needed only by certain input/output
1834 methods. Unset them by setting to an empty string ('').
1835
1836 =head2 id
1837
1838  Title     : id
1839  Usage     : $myalign->id("Ig")
1840  Function  : Gets/sets the id field of the alignment
1841  Returns   : An id string
1842  Argument  : An id string (optional)
1843
1844 =cut
1845
1846 sub id {
1847     my ($self, $name) = @_;
1848
1849     if (defined( $name )) {
1850         $self->{'_id'} = $name;
1851     }
1852
1853     return $self->{'_id'};
1854 }
1855
1856 =head2 accession
1857
1858  Title     : accession
1859  Usage     : $myalign->accession("PF00244")
1860  Function  : Gets/sets the accession field of the alignment
1861  Returns   : An acc string
1862  Argument  : An acc string (optional)
1863
1864 =cut
1865
1866 sub accession {
1867     my ($self, $acc) = @_;
1868
1869     if (defined( $acc )) {
1870         $self->{'_accession'} = $acc;
1871     }
1872
1873     return $self->{'_accession'};
1874 }
1875
1876 =head2 description
1877
1878  Title     : description
1879  Usage     : $myalign->description("14-3-3 proteins")
1880  Function  : Gets/sets the description field of the alignment
1881  Returns   : An description string
1882  Argument  : An description string (optional)
1883
1884 =cut
1885
1886 sub description {
1887     my ($self, $name) = @_;
1888
1889     if (defined( $name )) {
1890         $self->{'_description'} = $name;
1891     }
1892
1893     return $self->{'_description'};
1894 }
1895
1896 =head2 missing_char
1897
1898  Title     : missing_char
1899  Usage     : $myalign->missing_char("?")
1900  Function  : Gets/sets the missing_char attribute of the alignment
1901              It is generally recommended to set it to 'n' or 'N'
1902              for nucleotides and to 'X' for protein.
1903  Returns   : An missing_char string,
1904  Argument  : An missing_char string (optional)
1905
1906 =cut
1907
1908 sub missing_char {
1909     my ($self, $char) = @_;
1910
1911     if (defined $char ) {
1912         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1913         $self->{'_missing_char'} = $char;
1914     }
1915
1916     return $self->{'_missing_char'};
1917 }
1918
1919 =head2 match_char
1920
1921  Title     : match_char
1922  Usage     : $myalign->match_char('.')
1923  Function  : Gets/sets the match_char attribute of the alignment
1924  Returns   : An match_char string,
1925  Argument  : An match_char string (optional)
1926
1927 =cut
1928
1929 sub match_char {
1930     my ($self, $char) = @_;
1931
1932     if (defined $char ) {
1933         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1934         $self->{'_match_char'} = $char;
1935     }
1936
1937     return $self->{'_match_char'};
1938 }
1939
1940 =head2 gap_char
1941
1942  Title     : gap_char
1943  Usage     : $myalign->gap_char('-')
1944  Function  : Gets/sets the gap_char attribute of the alignment
1945  Returns   : An gap_char string, defaults to '-'
1946  Argument  : An gap_char string (optional)
1947
1948 =cut
1949
1950 sub gap_char {
1951     my ($self, $char) = @_;
1952
1953     if (defined $char || ! defined $self->{'_gap_char'} ) {
1954         $char= '-' unless defined $char;
1955         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1956         $self->{'_gap_char'} = $char;
1957     }
1958     return $self->{'_gap_char'};
1959 }
1960
1961 =head2 symbol_chars
1962
1963  Title   : symbol_chars
1964  Usage   : my @symbolchars = $aln->symbol_chars;
1965  Function: Returns all the seen symbols (other than gaps)
1966  Returns : array of characters that are the seen symbols
1967  Args    : boolean to include the gap/missing/match characters
1968
1969 =cut
1970
1971 sub symbol_chars{
1972    my ($self,$includeextra) = @_;
1973
1974    unless ($self->{'_symbols'}) {
1975        foreach my $seq ($self->each_seq) {
1976            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1977        }
1978    }
1979    my %copy = %{$self->{'_symbols'}};
1980    if( ! $includeextra ) {
1981        foreach my $char ( $self->gap_char, $self->match_char,
1982                           $self->missing_char) {
1983            delete $copy{$char} if( defined $char );
1984        }
1985    }
1986    return keys %copy;
1987 }
1988
1989 =head1 Alignment descriptors
1990
1991 These read only methods describe the MSA in various ways.
1992
1993
1994 =head2 score
1995
1996  Title     : score
1997  Usage     : $str = $ali->score()
1998  Function  : get/set a score of the alignment
1999  Returns   : a score for the alignment
2000  Argument  : an optional score to set
2001
2002 =cut
2003
2004 sub score {
2005   my $self = shift;
2006   $self->{score} = shift if @_;
2007   return $self->{score};
2008 }
2009
2010 =head2 consensus_string
2011
2012  Title     : consensus_string
2013  Usage     : $str = $ali->consensus_string($threshold_percent)
2014  Function  : Makes a strict consensus
2015  Returns   : Consensus string
2016  Argument  : Optional treshold ranging from 0 to 100.
2017              The consensus residue has to appear at least threshold %
2018              of the sequences at a given location, otherwise a '?'
2019              character will be placed at that location.
2020              (Default value = 0%)
2021
2022 =cut
2023
2024 sub consensus_string {
2025     my $self = shift;
2026     my $threshold = shift;
2027
2028     my $out = "";
2029     my $len = $self->length - 1;
2030
2031     foreach ( 0 .. $len ) {
2032         $out .= $self->_consensus_aa($_,$threshold);
2033     }
2034     return $out;
2035 }
2036
2037 sub _consensus_aa {
2038     my $self = shift;
2039     my $point = shift;
2040     my $threshold_percent = shift || -1 ;
2041     my ($seq,%hash,$count,$letter,$key);
2042     my $gapchar = $self->gap_char;
2043     foreach $seq ( $self->each_seq() ) {
2044         $letter = substr($seq->seq,$point,1);
2045         $self->throw("--$point-----------") if $letter eq '';
2046         ($letter eq $gapchar || $letter =~ /\./) && next;
2047         # print "Looking at $letter\n";
2048         $hash{$letter}++;
2049     }
2050     my $number_of_sequences = $self->num_sequences();
2051     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2052     $count = -1;
2053     $letter = '?';
2054
2055     foreach $key ( sort keys %hash ) {
2056         # print "Now at $key $hash{$key}\n";
2057         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2058             $letter = $key;
2059             $count = $hash{$key};
2060         }
2061     }
2062     return $letter;
2063 }
2064
2065
2066 =head2 consensus_iupac
2067
2068  Title     : consensus_iupac
2069  Usage     : $str = $ali->consensus_iupac()
2070  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2071              and RNA. The output is in upper case except when gaps in
2072              a column force output to be in lower case.
2073
2074              Note that if your alignment sequences contain a lot of
2075              IUPAC ambiquity codes you often have to manually set
2076              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2077              indicate a protein sequence.
2078  Returns   : consensus string
2079  Argument  : none
2080  Throws    : on protein sequences
2081
2082 =cut
2083
2084 sub consensus_iupac {
2085     my $self = shift;
2086     my $out = "";
2087     my $len = $self->length-1;
2088
2089     # only DNA and RNA sequences are valid
2090     foreach my $seq ( $self->each_seq() ) {
2091         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2092             if $seq->alphabet eq 'protein';
2093     }
2094     # loop over the alignment columns
2095     foreach my $count ( 0 .. $len ) {
2096         $out .= $self->_consensus_iupac($count);
2097     }
2098     return $out;
2099 }
2100
2101 sub _consensus_iupac {
2102     my ($self, $column) = @_;
2103     my ($string, $char, $rna);
2104
2105     #determine all residues in a column
2106     foreach my $seq ( $self->each_seq() ) {
2107         $string .= substr($seq->seq, $column, 1);
2108     }
2109     $string = uc $string;
2110
2111     # quick exit if there's an N in the string
2112     if ($string =~ /N/) {
2113         $string =~ /\W/ ? return 'n' : return 'N';
2114     }
2115     # ... or if there are only gap characters
2116     return '-' if $string =~ /^\W+$/;
2117
2118     # treat RNA as DNA in regexps
2119     if ($string =~ /U/) {
2120         $string =~ s/U/T/;
2121         $rna = 1;
2122     }
2123
2124     # the following s///'s only need to be done to the _first_ ambiguity code
2125     # as we only need to see the _range_ of characters in $string
2126
2127     if ($string =~ /[VDHB]/) {
2128         $string =~ s/V/AGC/;
2129         $string =~ s/D/AGT/;
2130         $string =~ s/H/ACT/;
2131         $string =~ s/B/CTG/;
2132     }
2133
2134     if ($string =~ /[SKYRWM]/) {
2135         $string =~ s/S/GC/;
2136         $string =~ s/K/GT/;
2137         $string =~ s/Y/CT/;
2138         $string =~ s/R/AG/;
2139         $string =~ s/W/AT/;
2140         $string =~ s/M/AC/;
2141     }
2142
2143     # and now the guts of the thing
2144
2145     if ($string =~ /A/) {
2146         $char = 'A';                     # A                      A
2147         if ($string =~ /G/) {
2148             $char = 'R';                 # A and G (purines)      R
2149             if ($string =~ /C/) {
2150                 $char = 'V';             # A and G and C          V
2151                 if ($string =~ /T/) {
2152                     $char = 'N';         # A and G and C and T    N
2153                 }
2154             } elsif ($string =~ /T/) {
2155                 $char = 'D';             # A and G and T          D
2156             }
2157         } elsif ($string =~ /C/) {
2158             $char = 'M';                 # A and C                M
2159             if ($string =~ /T/) {
2160                 $char = 'H';             # A and C and T          H
2161             }
2162         } elsif ($string =~ /T/) {
2163             $char = 'W';                 # A and T                W
2164         }
2165     } elsif ($string =~ /C/) {
2166         $char = 'C';                     # C                      C
2167         if ($string =~ /T/) {
2168             $char = 'Y';                 # C and T (pyrimidines)  Y
2169             if ($string =~ /G/) {
2170                 $char = 'B';             # C and T and G          B
2171             }
2172         } elsif ($string =~ /G/) {
2173             $char = 'S';                 # C and G                S
2174         }
2175     } elsif ($string =~ /G/) {
2176         $char = 'G';                     # G                      G
2177         if ($string =~ /C/) {
2178             $char = 'S';                 # G and C                S
2179         } elsif ($string =~ /T/) {
2180             $char = 'K';                 # G and T                K
2181         }
2182     } elsif ($string =~ /T/) {
2183         $char = 'T';                     # T                      T
2184     }
2185
2186     $char = 'U' if $rna and $char eq 'T';
2187     $char = lc $char if $string =~ /\W/;
2188
2189     return $char;
2190 }
2191
2192
2193 =head2 consensus_meta
2194
2195  Title     : consensus_meta
2196  Usage     : $seqmeta = $ali->consensus_meta()
2197  Function  : Returns a Bio::Seq::Meta object containing the consensus
2198              strings derived from meta data analysis.
2199  Returns   : Bio::Seq::Meta
2200  Argument  : Bio::Seq::Meta
2201  Throws    : non-MetaI object
2202
2203 =cut
2204
2205 sub consensus_meta {
2206     my ($self, $meta) = @_;
2207     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2208         $self->throw('Not a Bio::Seq::MetaI object');
2209     }
2210     return $self->{'_aln_meta'} = $meta if $meta;
2211     return $self->{'_aln_meta'}
2212 }
2213
2214 =head2 is_flush
2215
2216  Title     : is_flush
2217  Usage     : if ( $ali->is_flush() )
2218  Function  : Tells you whether the alignment
2219            : is flush, i.e. all of the same length
2220  Returns   : 1 or 0
2221  Argument  :
2222
2223 =cut
2224
2225 sub is_flush {
2226     my ($self,$report) = @_;
2227     my $seq;
2228     my $length = (-1);
2229     my $temp;
2230
2231     foreach $seq ( $self->each_seq() ) {
2232         if( $length == (-1) ) {
2233             $length = CORE::length($seq->seq());
2234             next;
2235         }
2236
2237         $temp = CORE::length($seq->seq());
2238         if( $temp != $length ) {
2239             $self->warn("expecting $length not $temp from ".
2240                         $seq->display_id) if( $report );
2241             $self->debug("expecting $length not $temp from ".
2242                          $seq->display_id);
2243             $self->debug($seq->seq(). "\n");
2244             return 0;
2245         }
2246     }
2247
2248     return 1;
2249 }
2250
2251
2252 =head2 length
2253
2254  Title     : length()
2255  Usage     : $len = $ali->length()
2256  Function  : Returns the maximum length of the alignment.
2257              To be sure the alignment is a block, use is_flush
2258  Returns   : Integer
2259  Argument  :
2260
2261 =cut
2262
2263 sub length_aln {
2264     my $self = shift;
2265     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2266     $self->length(@_);
2267 }
2268
2269 sub length {
2270     my $self = shift;
2271     my $seq;
2272     my $length = -1;
2273     my $temp;
2274
2275     foreach $seq ( $self->each_seq() ) {
2276         $temp = $seq->length();
2277         if( $temp > $length ) {
2278             $length = $temp;
2279         }
2280     }
2281
2282     return $length;
2283 }
2284
2285
2286 =head2 maxdisplayname_length
2287
2288  Title     : maxdisplayname_length
2289  Usage     : $ali->maxdisplayname_length()
2290  Function  : Gets the maximum length of the displayname in the
2291              alignment. Used in writing out various MSA formats.
2292  Returns   : integer
2293  Argument  :
2294
2295 =cut
2296
2297 sub maxname_length {
2298     my $self = shift;
2299     $self->deprecated("maxname_length - deprecated method.".
2300                       " Use maxdisplayname_length() instead.");
2301     $self->maxdisplayname_length();
2302 }
2303
2304 sub maxnse_length {
2305     my $self = shift;
2306     $self->deprecated("maxnse_length - deprecated method.".
2307                       " Use maxnse_length() instead.");
2308     $self->maxdisplayname_length();
2309 }
2310
2311 sub maxdisplayname_length {
2312     my $self = shift;
2313     my $maxname = (-1);
2314     my ($seq,$len);
2315
2316     foreach $seq ( $self->each_seq() ) {
2317         $len = CORE::length $self->displayname($seq->get_nse());
2318
2319         if( $len > $maxname ) {
2320             $maxname = $len;
2321         }
2322     }
2323
2324     return $maxname;
2325 }
2326
2327 =head2 max_metaname_length
2328
2329  Title     : max_metaname_length
2330  Usage     : $ali->max_metaname_length()
2331  Function  : Gets the maximum length of the meta name tags in the
2332              alignment for the sequences and for the alignment.
2333              Used in writing out various MSA formats.
2334  Returns   : integer
2335  Argument  : None
2336
2337 =cut
2338
2339 sub max_metaname_length {
2340     my $self = shift;
2341     my $maxname = (-1);
2342     my ($seq,$len);
2343
2344     # check seq meta first
2345     for $seq ( $self->each_seq() ) {
2346         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2347         for my $mtag ($seq->meta_names) {
2348             $len = CORE::length $mtag;
2349             if( $len > $maxname ) {
2350                 $maxname = $len;
2351             }
2352         }
2353     }
2354
2355     # alignment meta
2356     for my $meta ($self->consensus_meta) {
2357         next unless $meta;
2358         for my $name ($meta->meta_names) {
2359             $len = CORE::length $name;
2360             if( $len > $maxname ) {
2361                 $maxname = $len;
2362             }
2363         }
2364     }
2365
2366     return $maxname;
2367 }
2368
2369 =head2 num_residues
2370
2371  Title     : num_residues
2372  Usage     : $no = $ali->num_residues
2373  Function  : number of residues in total in the alignment
2374  Returns   : integer
2375  Argument  :
2376  Note      : replaces no_residues()
2377
2378 =cut
2379
2380 sub num_residues {
2381     my $self = shift;
2382     my $count = 0;
2383
2384     foreach my $seq ($self->each_seq) {
2385         my $str = $seq->seq();
2386
2387         $count += ($str =~ s/[A-Za-z]//g);
2388     }
2389
2390     return $count;
2391 }
2392
2393 =head2 num_sequences
2394
2395  Title     : num_sequences
2396  Usage     : $depth = $ali->num_sequences
2397  Function  : number of sequence in the sequence alignment
2398  Returns   : integer
2399  Argument  : none
2400  Note      : replaces no_sequences()
2401
2402 =cut
2403
2404 sub num_sequences {
2405     my $self = shift;
2406     return scalar($self->each_seq);
2407 }
2408
2409
2410 =head2 average_percentage_identity
2411
2412  Title   : average_percentage_identity
2413  Usage   : $id = $align->average_percentage_identity
2414  Function: The function uses a fast method to calculate the average
2415            percentage identity of the alignment
2416  Returns : The average percentage identity of the alignment
2417  Args    : None
2418  Notes   : This method implemented by Kevin Howe calculates a figure that is
2419            designed to be similar to the average pairwise identity of the
2420            alignment (identical in the absence of gaps), without having to
2421            explicitly calculate pairwise identities proposed by Richard Durbin.
2422            Validated by Ewan Birney ad Alex Bateman.
2423
2424 =cut
2425
2426 sub average_percentage_identity{
2427    my ($self,@args) = @_;
2428
2429    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2430                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2431
2432    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2433
2434    if (! $self->is_flush()) {
2435        $self->throw("All sequences in the alignment must be the same length");
2436    }
2437
2438    @seqs = $self->each_seq();
2439    $len = $self->length();
2440
2441    # load the each hash with correct keys for existence checks
2442
2443    for( my $index=0; $index < $len; $index++) {
2444        foreach my $letter (@alphabet) {
2445            $countHashes[$index]->{$letter} = 0;
2446        }
2447    }
2448    foreach my $seq (@seqs)  {
2449        my @seqChars = split //, $seq->seq();
2450        for( my $column=0; $column < @seqChars; $column++ ) {
2451            my $char = uc($seqChars[$column]);
2452            if (exists $countHashes[$column]->{$char}) {
2453                $countHashes[$column]->{$char}++;
2454            }
2455        }
2456    }
2457
2458    $total = 0;
2459    $divisor = 0;
2460    for(my $column =0; $column < $len; $column++) {
2461        my %hash = %{$countHashes[$column]};
2462        $subdivisor = 0;
2463        foreach my $res (keys %hash) {
2464            $total += $hash{$res}*($hash{$res} - 1);
2465            $subdivisor += $hash{$res};
2466        }
2467        $divisor += $subdivisor * ($subdivisor - 1);
2468    }
2469    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2470 }
2471
2472 =head2 percentage_identity
2473
2474  Title   : percentage_identity
2475  Usage   : $id = $align->percentage_identity
2476  Function: The function calculates the average percentage identity
2477            (aliased to average_percentage_identity)
2478  Returns : The average percentage identity
2479  Args    : None
2480
2481 =cut
2482
2483 sub percentage_identity {
2484     my $self = shift;
2485     return $self->average_percentage_identity();
2486 }
2487
2488 =head2 overall_percentage_identity
2489
2490  Title   : overall_percentage_identity
2491  Usage   : $id = $align->overall_percentage_identity
2492            $id = $align->overall_percentage_identity('short')
2493  Function: The function calculates the percentage identity of
2494            the conserved columns
2495  Returns : The percentage identity of the conserved columns
2496  Args    : length value to use, optional defaults to alignment length
2497                  possible values: 'align', 'short', 'long'
2498
2499 The argument values 'short' and 'long' refer to shortest and longest
2500 sequence in the alignment. Method modification code by Hongyu Zhang.
2501
2502 =cut
2503
2504 sub overall_percentage_identity{
2505    my ($self, $length_measure) = @_;
2506
2507    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2508                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2509
2510    my ($len, $total, @seqs, @countHashes);
2511
2512    my %enum = map {$_ => 1} qw (align short long);
2513
2514    $self->throw("Unknown argument [$length_measure]")
2515        if $length_measure and not $enum{$length_measure};
2516    $length_measure ||= 'align';
2517
2518    if (! $self->is_flush()) {
2519        $self->throw("All sequences in the alignment must be the same length");
2520    }
2521
2522    @seqs = $self->each_seq();
2523    $len = $self->length();
2524
2525    # load the each hash with correct keys for existence checks
2526    for( my $index=0; $index < $len; $index++) {
2527        foreach my $letter (@alphabet) {
2528            $countHashes[$index]->{$letter} = 0;
2529        }
2530    }
2531    foreach my $seq (@seqs)  {
2532        my @seqChars = split //, $seq->seq();
2533        for( my $column=0; $column < @seqChars; $column++ ) {
2534            my $char = uc($seqChars[$column]);
2535            if (exists $countHashes[$column]->{$char}) {
2536                $countHashes[$column]->{$char}++;
2537            }
2538        }
2539    }
2540
2541    $total = 0;
2542    for(my $column =0; $column < $len; $column++) {
2543        my %hash = %{$countHashes[$column]};
2544        foreach ( values %hash ) {
2545            next if( $_ == 0 );
2546            $total++ if( $_ == scalar @seqs );
2547            last;
2548        }
2549    }
2550
2551    if ($length_measure eq 'short') {
2552        ## find the shortest length
2553        $len = 0;
2554        foreach my $seq ($self->each_seq) {
2555            my $count = $seq->seq =~ tr/[A-Za-z]//;
2556            if ($len) {
2557                $len = $count if $count < $len;
2558            } else {
2559                $len = $count;
2560            }
2561        }
2562    }
2563    elsif ($length_measure eq 'long') {
2564        ## find the longest length
2565        $len = 0;
2566        foreach my $seq ($self->each_seq) {
2567            my $count = $seq->seq =~ tr/[A-Za-z]//;
2568            if ($len) {
2569                $len = $count if $count > $len;
2570            } else {
2571                $len = $count;
2572            }
2573        }
2574    }
2575
2576    return ($total / $len ) * 100.0;
2577 }
2578
2579
2580
2581 =head1 Alignment positions
2582
2583 Methods to map a sequence position into an alignment column and back.
2584 column_from_residue_number() does the former. The latter is really a
2585 property of the sequence object and can done using
2586 L<Bio::LocatableSeq::location_from_column>:
2587
2588     # select somehow a sequence from the alignment, e.g.
2589     my $seq = $aln->get_seq_by_pos(1);
2590     #$loc is undef or Bio::LocationI object
2591     my $loc = $seq->location_from_column(5);
2592
2593 =head2 column_from_residue_number
2594
2595  Title   : column_from_residue_number
2596  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2597  Function: This function gives the position in the alignment
2598            (i.e. column number) of the given residue number in the
2599            sequence with the given name. For example, for the
2600            alignment
2601
2602              Seq1/91-97 AC..DEF.GH.
2603              Seq2/24-30 ACGG.RTY...
2604                 Seq3/43-51 AC.DDEF.GHI
2605
2606            column_from_residue_number( "Seq1", 94 ) returns 6.
2607            column_from_residue_number( "Seq2", 25 ) returns 2.
2608            column_from_residue_number( "Seq3", 50 ) returns 10.
2609
2610            An exception is thrown if the residue number would lie
2611            outside the length of the aligment
2612            (e.g. column_from_residue_number( "Seq2", 22 )
2613
2614           Note: If the the parent sequence is represented by more than
2615                 one alignment sequence and the residue number is present in
2616                 them, this method finds only the first one.
2617
2618  Returns : A column number for the position in the alignment of the
2619            given residue in the given sequence (1 = first column)
2620  Args    : A sequence id/name (not a name/start-end)
2621            A residue number in the whole sequence (not just that
2622            segment of it in the alignment)
2623
2624 =cut
2625
2626 sub column_from_residue_number {
2627     my ($self, $name, $resnumber) = @_;
2628
2629     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2630     $self->throw("Second argument residue number missing") unless $resnumber;
2631
2632     foreach my $seq ($self->each_seq_with_id($name)) {
2633         my $col;
2634         eval {
2635             $col = $seq->column_from_residue_number($resnumber);
2636         };
2637         next if $@;
2638         return $col;
2639     }
2640
2641     $self->throw("Could not find a sequence segment in $name ".
2642                  "containing residue number $resnumber");
2643
2644 }
2645
2646 =head1 Sequence names
2647
2648 Methods to manipulate the display name. The default name based on the
2649 sequence id and subsequence positions can be overridden in various
2650 ways.
2651
2652 =head2 displayname
2653
2654  Title     : displayname
2655  Usage     : $myalign->displayname("Ig", "IgA")
2656  Function  : Gets/sets the display name of a sequence in the alignment
2657  Returns   : A display name string
2658  Argument  : name of the sequence
2659              displayname of the sequence (optional)
2660
2661 =cut
2662
2663 sub get_displayname {
2664     my $self = shift;
2665     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2666     $self->displayname(@_);
2667 }
2668
2669 sub set_displayname {
2670     my $self = shift;
2671     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2672     $self->displayname(@_);
2673 }
2674
2675 sub displayname {
2676     my ($self, $name, $disname) = @_;
2677
2678     $self->throw("No sequence with name [$name]")
2679         unless defined $self->{'_seq'}->{$name};
2680
2681     if(  $disname and  $name) {
2682         $self->{'_dis_name'}->{$name} = $disname;
2683         return $disname;
2684     }
2685     elsif( defined $self->{'_dis_name'}->{$name} ) {
2686         return  $self->{'_dis_name'}->{$name};
2687     } else {
2688         return $name;
2689     }
2690 }
2691
2692 =head2 set_displayname_count
2693
2694  Title     : set_displayname_count
2695  Usage     : $ali->set_displayname_count
2696  Function  : Sets the names to be name_# where # is the number of
2697              times this name has been used.
2698  Returns   : 1, on success
2699  Argument  :
2700
2701 =cut
2702
2703 sub set_displayname_count {
2704     my $self= shift;
2705     my (@arr,$name,$seq,$count,$temp,$nse);
2706
2707     foreach $seq ( $self->each_alphabetically() ) {
2708         $nse = $seq->get_nse();
2709
2710         #name will be set when this is the second
2711         #time (or greater) is has been seen
2712
2713         if( defined $name and $name eq ($seq->id()) ) {
2714             $temp = sprintf("%s_%s",$name,$count);
2715             $self->displayname($nse,$temp);
2716             $count++;
2717         } else {
2718             $count = 1;
2719             $name = $seq->id();
2720             $temp = sprintf("%s_%s",$name,$count);
2721             $self->displayname($nse,$temp);
2722             $count++;
2723         }
2724     }
2725     return 1;
2726 }
2727
2728 =head2 set_displayname_flat
2729
2730  Title     : set_displayname_flat
2731  Usage     : $ali->set_displayname_flat()
2732  Function  : Makes all the sequences be displayed as just their name,
2733              not name/start-end
2734  Returns   : 1
2735  Argument  :
2736
2737 =cut
2738
2739 sub set_displayname_flat {
2740     my $self = shift;
2741     my ($nse,$seq);
2742
2743     foreach $seq ( $self->each_seq() ) {
2744         $nse = $seq->get_nse();
2745         $self->displayname($nse,$seq->id());
2746     }
2747     return 1;
2748 }
2749
2750 =head2 set_displayname_normal
2751
2752  Title     : set_displayname_normal
2753  Usage     : $ali->set_displayname_normal()
2754  Function  : Makes all the sequences be displayed as name/start-end
2755  Returns   : 1, on success
2756  Argument  :
2757
2758 =cut
2759
2760 sub set_displayname_normal {
2761     my $self = shift;
2762     my ($nse,$seq);
2763
2764     foreach $seq ( $self->each_seq() ) {
2765         $nse = $seq->get_nse();
2766         $self->displayname($nse,$nse);
2767     }
2768     return 1;
2769 }
2770
2771 =head2 source
2772
2773  Title   : source
2774  Usage   : $obj->source($newval)
2775  Function: sets the Alignment source program
2776  Example :
2777  Returns : value of source
2778  Args    : newvalue (optional)
2779
2780
2781 =cut
2782
2783 sub source{
2784    my ($self,$value) = @_;
2785    if( defined $value) {
2786       $self->{'_source'} = $value;
2787     }
2788     return $self->{'_source'};
2789 }
2790
2791 =head2 set_displayname_safe
2792
2793  Title     : set_displayname_safe
2794  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2795  Function  : Assign machine-generated serial names to sequences in input order.
2796              Designed to protect names during PHYLIP runs. Assign 10-char string
2797              in the form of "S000000001" to "S999999999". Restore the original
2798              names using "restore_displayname".
2799  Returns   : 1. a new $aln with system names;
2800              2. a hash ref for restoring names
2801  Argument  : Number for id length (default 10)
2802
2803 =cut
2804
2805 sub set_displayname_safe {
2806     my $self = shift;
2807     my $idlength = shift || 10;
2808     my ($seq, %phylip_name);
2809     my $ct=0;
2810     my $new=Bio::SimpleAlign->new();
2811     foreach $seq ( $self->each_seq() ) {
2812         $ct++;
2813         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2814         $phylip_name{$pname}=$seq->id();
2815         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2816                                             -seq      => $seq->seq(),
2817                                             -alphabet => $seq->alphabet,
2818                                             -start    => $seq->{_start},
2819                                             -end      => $seq->{_end}
2820                                             );
2821         $new->add_seq($new_seq);
2822     }
2823
2824     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2825     return ($new, \%phylip_name);
2826 }
2827
2828 =head2 restore_displayname
2829
2830  Title     : restore_displayname
2831  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2832  Function  : Restore original sequence names (after running
2833              $ali->set_displayname_safe)
2834  Returns   : a new $aln with names restored.
2835  Argument  : a hash reference of names from "set_displayname_safe".
2836
2837 =cut
2838
2839 sub restore_displayname {
2840     my $self = shift;
2841     my $ref=shift;
2842     my %name=%$ref;
2843     my $new=Bio::SimpleAlign->new();
2844     foreach my $seq ( $self->each_seq() ) {
2845       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2846       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2847                                           -seq      => $seq->seq(),
2848                                           -alphabet => $seq->alphabet,
2849                                           -start    => $seq->{_start},
2850                                           -end      => $seq->{_end}
2851                                           );
2852       $new->add_seq($new_seq);
2853     }
2854     return $new;
2855 }
2856
2857 =head2 sort_by_start
2858
2859  Title     : sort_by_start
2860  Usage     : $ali->sort_by_start
2861  Function  : Changes the order of the alignment to the start position of each
2862              subalignment
2863  Returns   :
2864  Argument  :
2865
2866 =cut
2867
2868 sub sort_by_start {
2869     my $self = shift;
2870     my ($seq,$nse,@arr,%hash,$count);
2871     foreach $seq ( $self->each_seq() ) {
2872         $nse = $seq->get_nse;
2873         $hash{$nse} = $seq;
2874     }
2875     $count = 0;
2876     %{$self->{'_order'}} = (); # reset the hash;
2877     foreach $nse ( sort _startend keys %hash) {
2878         $self->{'_order'}->{$count} = $nse;
2879         $count++;
2880     }
2881     1;
2882 }
2883
2884 sub _startend
2885 {
2886     my ($aname,$arange) = split (/[\/]/,$a);
2887     my ($bname,$brange) = split (/[\/]/,$b);
2888     my ($astart,$aend) = split(/\-/,$arange);
2889     my ($bstart,$bend) = split(/\-/,$brange);
2890     return $astart <=> $bstart;
2891 }
2892
2893 =head2 bracket_string
2894
2895  Title     : bracket_string
2896  Usage     : my @params = (-refseq     => 'testseq',
2897                            -allele1    => 'allele1',
2898                            -allele2    => 'allele2',
2899                            -delimiters => '{}',
2900                            -separator  => '/');
2901              $str = $aln->bracket_string(@params)
2902
2903  Function :  When supplied with a list of parameters (see below), returns a
2904              string in BIC format. This is used for allelic comparisons.
2905              Briefly, if either allele contains a base change when compared to
2906              the refseq, the base or gap for each allele is represented in
2907              brackets in the order present in the 'alleles' parameter.
2908
2909              For the following data:
2910
2911              >testseq
2912              GGATCCATTGCTACT
2913              >allele1
2914              GGATCCATTCCTACT
2915              >allele2
2916              GGAT--ATTCCTCCT
2917
2918              the returned string with parameters 'refseq => testseq' and
2919              'alleles => [qw(allele1 allele2)]' would be:
2920
2921              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2922  Returns   : BIC-formatted string
2923  Argument  : Required args
2924                 refseq    : string (ID) of the reference sequence used
2925                             as basis for comparison
2926                 allele1   : string (ID) of the first allele
2927                 allele2   : string (ID) of the second allele
2928              Optional args
2929                 delimiters: two symbol string of left and right delimiters.
2930                             Only the first two symbols are used
2931                             default = '[]'
2932                 separator : string used as a separator.  Only the first
2933                             symbol is used
2934                             default = '/'
2935  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2936
2937 =cut
2938
2939 sub bracket_string {
2940     my ($self, @args) = @_;
2941     my ($ref, $a1, $a2, $delim, $sep) =
2942         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2943     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2944     my ($ld, $rd);
2945     ($ld, $rd) = split('', $delim, 2) if $delim;
2946     $ld ||= '[';
2947     $rd ||= ']';
2948     $sep ||= '/';
2949     my ($refseq, $allele1, $allele2) =
2950         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2951     if (!$refseq || !$allele1 || !$allele2) {
2952         $self->throw("One of your refseq/allele IDs is invalid!");
2953     }
2954     my $len = $self->length-1;
2955     my $bic = '';
2956     # loop over the alignment columns
2957     for my $column ( 0 .. $len ) {
2958         my $string;
2959         my ($compres, $res1, $res2) =
2960             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2961         # are any of the allele symbols different from the refseq?
2962         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2963                 $ld.$res1.$sep.$res2.$rd;
2964         $bic .= $string;
2965     }
2966     return $bic;
2967 }
2968
2969
2970 =head2 methods implementing Bio::FeatureHolderI
2971
2972 FeatureHolderI implementation to support labeled character sets like one
2973 would get from NEXUS represented data.
2974
2975 =head2 get_SeqFeatures
2976
2977  Usage   : @features = $aln->get_SeqFeatures
2978  Function: Get the feature objects held by this feature holder.
2979  Example :
2980  Returns : an array of Bio::SeqFeatureI implementing objects
2981  Args    : optional filter coderef, taking a Bio::SeqFeatureI
2982          : as argument, returning TRUE if wanted, FALSE if
2983          : unwanted
2984
2985 =cut
2986
2987 sub get_SeqFeatures {
2988     my $self = shift;
2989     my $filter_cb = shift;
2990     $self->throw("Arg (filter callback) must be a coderef") unless
2991         !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2992     if( !defined $self->{'_as_feat'} ) {
2993         $self->{'_as_feat'} = [];
2994     }
2995     if ($filter_cb) {
2996         return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2997     }
2998     return @{$self->{'_as_feat'}};
2999 }
3000
3001 =head2 add_SeqFeature
3002
3003  Usage   : $aln->add_SeqFeature($subfeat);
3004  Function: adds a SeqFeature into the SeqFeature array.
3005  Example :
3006  Returns : true on success
3007  Args    : a Bio::SeqFeatureI object
3008  Note    : This implementation is not compliant
3009            with Bio::FeatureHolderI
3010
3011 =cut
3012
3013 sub add_SeqFeature {
3014    my ($self,@feat) = @_;
3015
3016    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3017
3018    foreach my $feat ( @feat ) {
3019        if( !$feat->isa("Bio::SeqFeatureI") ) {
3020            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3021        }
3022
3023        push(@{$self->{'_as_feat'}},$feat);
3024    }
3025    return 1;
3026 }
3027
3028
3029 =head2 remove_SeqFeatures
3030
3031  Usage   : $obj->remove_SeqFeatures
3032  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3033            remove that subset from the returned array, and add back the rest.
3034  Returns : The array of Bio::SeqFeatureI features that was
3035            deleted from this alignment.
3036  Args    : none
3037
3038 =cut
3039
3040 sub remove_SeqFeatures {
3041     my $self = shift;
3042
3043     return () unless $self->{'_as_feat'};
3044     my @feats = @{$self->{'_as_feat'}};
3045     $self->{'_as_feat'} = [];
3046     return @feats;
3047 }
3048
3049 =head2 feature_count
3050
3051  Title   : feature_count
3052  Usage   : $obj->feature_count()
3053  Function: Return the number of SeqFeatures attached to the alignment
3054  Returns : integer representing the number of SeqFeatures
3055  Args    : None
3056
3057 =cut
3058
3059 sub feature_count {
3060     my ($self) = @_;
3061
3062     if (defined($self->{'_as_feat'})) {
3063         return ($#{$self->{'_as_feat'}} + 1);
3064     } else {
3065         return 0;
3066     }
3067 }
3068
3069 =head2 get_all_SeqFeatures
3070
3071  Title   : get_all_SeqFeatures
3072  Usage   :
3073  Function: Get all SeqFeatures.
3074  Example :
3075  Returns : an array of Bio::SeqFeatureI implementing objects
3076  Args    : none
3077  Note    : Falls through to Bio::FeatureHolderI implementation.
3078
3079 =cut
3080
3081 =head2 methods for Bio::AnnotatableI
3082
3083 AnnotatableI implementation to support sequence alignments which
3084 contain annotation (NEXUS, Stockholm).
3085
3086 =head2 annotation
3087
3088  Title   : annotation
3089  Usage   : $ann = $aln->annotation or
3090            $aln->annotation($ann)
3091  Function: Gets or sets the annotation
3092  Returns : Bio::AnnotationCollectionI object
3093  Args    : None or Bio::AnnotationCollectionI object
3094
3095 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3096 for more information
3097
3098 =cut
3099
3100 sub annotation {
3101     my ($obj,$value) = @_;
3102     if( defined $value ) {
3103         $obj->throw("object of class ".ref($value)." does not implement ".
3104                 "Bio::AnnotationCollectionI. Too bad.")
3105             unless $value->isa("Bio::AnnotationCollectionI");
3106         $obj->{'_annotation'} = $value;
3107     } elsif( ! defined $obj->{'_annotation'}) {
3108         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3109     }
3110     return $obj->{'_annotation'};
3111 }
3112
3113 =head1 Deprecated methods
3114
3115 =cut
3116
3117 =head2 no_residues
3118
3119  Title     : no_residues
3120  Usage     : $no = $ali->no_residues
3121  Function  : number of residues in total in the alignment
3122  Returns   : integer
3123  Argument  :
3124  Note      : deprecated in favor of num_residues()
3125
3126 =cut
3127
3128 sub no_residues {
3129         my $self = shift;
3130         $self->deprecated(-warn_version => 1.0069,
3131                                           -throw_version => 1.0075,
3132                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3133     $self->num_residues(@_);
3134 }
3135
3136 =head2 no_sequences
3137
3138  Title     : no_sequences
3139  Usage     : $depth = $ali->no_sequences
3140  Function  : number of sequence in the sequence alignment
3141  Returns   : integer
3142  Argument  :
3143  Note      : deprecated in favor of num_sequences()
3144
3145 =cut
3146
3147 sub no_sequences {
3148         my $self = shift;
3149         $self->deprecated(-warn_version => 1.0069,
3150                                           -throw_version => 1.0075,
3151                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3152     $self->num_sequences(@_);
3153 }
3154
3155 =head2 mask_columns
3156
3157  Title     : mask_columns
3158  Usage     : $aln2 = $aln->mask_columns(20,30)
3159  Function  : Masks a slice of the alignment inclusive of start and
3160              end columns, and the first column in the alignment is denoted 1.
3161              Mask beyond the length of the sequence does not do padding.
3162  Returns   : A Bio::SimpleAlign object
3163  Args      : Positive integer for start column, positive integer for end column,
3164              optional string value use for the mask. Example:
3165
3166              $aln2 = $aln->mask_columns(20,30,'?')
3167  Note      : Masking must use a character that is not used for gaps or
3168              frameshifts.  These can be adjusted using the relevant global
3169              variables, but be aware these may be (uncontrollably) modified
3170              elsewhere within BioPerl (see bug 2715)
3171
3172 =cut
3173
3174 sub mask_columns {
3175     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3176     my $self = shift;
3177
3178     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3179              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3180
3181     # coordinates are alignment-based, not sequence-based
3182     my ($start, $end, $mask_char) = @_;
3183     unless (defined $mask_char) { $mask_char = 'N' }
3184
3185     $self->throw("Mask start has to be a positive integer and less than ".
3186                  "alignment length, not [$start]")
3187       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3188     $self->throw("Mask end has to be a positive integer and less than ".
3189                  "alignment length, not [$end]")
3190       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3191     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3192                  "end [$end]") unless $start <= $end;
3193     $self->throw("Mask character $mask_char has to be a single character ".
3194                  "and not a gap or frameshift symbol")
3195       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3196
3197     my $aln = $self->new;
3198     $aln->id($self->id);
3199     foreach my $seq ( $self->each_seq() ) {
3200         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3201          -alphabet => $seq->alphabet,
3202          -strand  => $seq->strand,
3203          -verbose => $self->verbose);
3204
3205         # convert from 1-based alignment coords!
3206         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3207         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3208         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3209         $new_seq->seq($new_dna_string);
3210         $aln->add_seq($new_seq);
3211     }
3212     return $aln;
3213 }
3214
3215
3216
3217 1;