Bio/SimpleAlign.pm

   1 # BioPerl module for SimpleAlign
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12 #
  13 #  History:
  14 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  15 #       May 2001 major rewrite - Heikki Lehvaslaiho
  16
  17 =head1 NAME
  18
  19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  20
  21 =head1 SYNOPSIS
  22
  23   # Use Bio::AlignIO to read in the alignment
  24   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  25   $aln = $str->next_aln();
  26
  27   # Describe
  28   print $aln->length;
  29   print $aln->num_residues;
  30   print $aln->is_flush;
  31   print $aln->num_sequences;
  32   print $aln->score;
  33   print $aln->percentage_identity;
  34   print $aln->consensus_string(50);
  35
  36   # Find the position in the alignment for a sequence location
  37   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  38
  39   # Extract sequences and check values for the alignment column $pos
  40   foreach $seq ($aln->each_seq) {
  41       $res = $seq->subseq($pos, $pos);
  42       $count{$res}++;
  43   }
  44   foreach $res (keys %count) {
  45       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  46   }
  47
  48   # Manipulate
  49   $aln->remove_seq($seq);
  50   $mini_aln = $aln->slice(20,30);  # get a block of columns
  51   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  52   $new_aln = $aln->remove_columns([20,30]); # remove by position
  53   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  54
  55   # Analyze
  56   $str = $aln->consensus_string($threshold_percent);
  57   $str = $aln->match_line();
  58   $str = $aln->cigar_line();
  59   $id = $aln->percentage_identity;
  60
  61   # See the module documentation for details and more methods.
  62
  63 =head1 DESCRIPTION
  64
  65 SimpleAlign is an object that handles a multiple sequence alignment
  66 (MSA). It is very permissive of types (it does not insist on sequences
  67 being all same length, for example). Think of it as a set of sequences
  68 with a whole series of built-in manipulations and methods for reading and
  69 writing alignments.
  70
  71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  72 to store its sequences. These are subsequences with a start and end
  73 positions in the parent reference sequence. Each sequence in the
  74 SimpleAlign object is a Bio::LocatableSeq.
  75
  76 SimpleAlign expects the combination of name, start, and end for a
  77 given sequence to be unique in the alignment, and this is the key for the
  78 internal hashes (name, start, end are abbreviated C<nse> in the code).
  79 However, in some cases people do not want the name/start-end to be displayed:
  80 either multiple names in an alignment or names specific to the alignment
  81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  82 C<displayname>, and generally is what is used to print out the
  83 alignment. They default to name/start-end.
  84
  85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  86
  87 =head1 FEEDBACK
  88
  89 =head2 Mailing Lists
  90
  91 User feedback is an integral part of the evolution of this and other
  92 Bioperl modules.  Send your comments and suggestions preferably to one
  93 of the Bioperl mailing lists.  Your participation is much appreciated.
  94
  95   bioperl-l@bioperl.org                  - General discussion
  96   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  97
  98 =head2 Support
  99
 100 Please direct usage questions or support issues to the mailing list:
 101
 102 I<bioperl-l@bioperl.org>
 103
 104 rather than to the module maintainer directly. Many experienced and
 105 reponsive experts will be able look at the problem and quickly
 106 address it. Please include a thorough description of the problem
 107 with code and data examples if at all possible.
 108
 109 =head2 Reporting Bugs
 110
 111 Report bugs to the Bioperl bug tracking system to help us keep track
 112 the bugs and their resolution. Bug reports can be submitted via the
 113 web:
 114
 115   http://bugzilla.open-bio.org/
 116
 117 =head1 AUTHOR
 118
 119 Ewan Birney, birney@ebi.ac.uk
 120
 121 =head1 CONTRIBUTORS
 122
 123 Allen Day, allenday-at-ucla.edu,
 124 Richard Adams, Richard.Adams-at-ed.ac.uk,
 125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 127 Allen Smith, allens-at-cpan.org,
 128 Jason Stajich, jason-at-bioperl.org,
 129 Anthony Underwood, aunderwood-at-phls.org.uk,
 130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 131 Brian Osborne, bosborne at alum.mit.edu
 132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 133 Hongyu Zhang, forward at hongyu.org
 134 Jay Hannah, jay at jays.net
 135 Alexandr Bezginov, albezg at gmail.com
 136
 137 =head1 SEE ALSO
 138
 139 L<Bio::LocatableSeq>
 140
 141 =head1 APPENDIX
 142
 143 The rest of the documentation details each of the object
 144 methods. Internal methods are usually preceded with a _
 145
 146 =cut
 147
 148 # 'Let the code begin...
 149
 150 package Bio::SimpleAlign;
 151 use vars qw(%CONSERVATION_GROUPS);
 152 use strict;
 153
 154 use Bio::LocatableSeq;  # uses Seq's as list
 155
 156 use Bio::Seq;
 157 use Bio::SeqFeature::Generic;
 158
 159 BEGIN {
 160     # This data should probably be in a more centralized module...
 161     # it is taken from Clustalw documentation.
 162     # These are all the positively scoring groups that occur in the
 163     # Gonnet Pam250 matrix. The strong and weak groups are
 164     # defined as strong score >0.5 and weak score =<0.5 respectively.
 165
 166     %CONSERVATION_GROUPS = (
 167             'strong' => [ qw(
 168                                                  STA
 169                                                  NEQK
 170                                                  NHQK
 171                                                  NDEQ
 172                                                  QHRK
 173                                                  MILV
 174                                                  MILF
 175                                                  HY
 176                                                  FYW )],
 177                                 'weak' => [ qw(
 178                       CSA
 179                                                ATV
 180                                                SAG
 181                                                STNK
 182                                                STPA
 183                                                SGND
 184                                                SNDEQK
 185                                                NDEQHK
 186                                                NEQHRK
 187                                                FVLIM
 188                                                HFY )],);
 189 }
 190
 191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 192             Bio::FeatureHolderI);
 193
 194 =head2 new
 195
 196  Title     : new
 197  Usage     : my $aln = Bio::SimpleAlign->new();
 198  Function  : Creates a new simple align object
 199  Returns   : Bio::SimpleAlign
 200  Args      : -source     => string representing the source program
 201                             where this alignment came from
 202              -annotation => Bio::AnnotationCollectionI
 203              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 204              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 205              -consensus  => consensus string
 206              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 207
 208 =cut
 209
 210
 211 sub new {
 212   my($class,@args) = @_;
 213
 214   my $self = $class->SUPER::new(@args);
 215
 216   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 217                                             SOURCE
 218                                             SCORE
 219                                             ID
 220                                             ACCESSION
 221                                             DESCRIPTION
 222                                             SEQS
 223                                             FEATURES
 224                                             ANNOTATION
 225                                             SEQ_ANNOTATION
 226                                             CONSENSUS
 227                                             CONSENSUS_META
 228                                             )], @args);
 229   $src && $self->source($src);
 230   defined $score && $self->score($score);
 231   # we need to set up internal hashs first!
 232
 233   $self->{'_seq'} = {};
 234   $self->{'_order'} = {};
 235   $self->{'_start_end_lists'} = {};
 236   $self->{'_dis_name'} = {};
 237   $self->{'_id'} = 'NoName';
 238   # maybe we should automatically read in from args. Hmmm...
 239   $id  && $self->id($id);
 240   $acc && $self->accession($acc);
 241   $desc && $self->description($desc);
 242   $coll && $self->annotation($coll);
 243   # sequence annotation is layered into a provided annotation collection (or dies)
 244   if ($sa) {
 245     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 246                  "with a sequence annotation collection")
 247         if !$coll;
 248     $coll->add_Annotation('seq_annotation', $sa);
 249   }
 250   if ($feats && ref $feats eq 'ARRAY') {
 251     for my $feat (@$feats) {
 252         $self->add_SeqFeature($feat);
 253     }
 254   }
 255   $con && $self->consensus($con);
 256   $cmeta && $self->consensus_meta($cmeta);
 257   # assumes these are in correct alignment order
 258   if ($seqs && ref($seqs) eq 'ARRAY') {
 259     for my $seq (@$seqs) {
 260         $self->add_seq($seq);
 261     }
 262   }
 263
 264   return $self; # success - we hope!
 265 }
 266
 267 =head1 Modifier methods
 268
 269 These methods modify the MSA by adding, removing or shuffling complete
 270 sequences.
 271
 272 =head2 add_seq
 273
 274  Title     : add_seq
 275  Usage     : $myalign->add_seq($newseq);
 276              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 277  Function  : Adds another sequence to the alignment. *Does not* align
 278              it - just adds it to the hashes.
 279              If -ORDER is specified, the sequence is inserted at the
 280              the position spec'd by -ORDER, and existing sequences
 281              are pushed down the storage array.
 282  Returns   : nothing
 283  Args      : A Bio::LocatableSeq object
 284              Positive integer for the sequence position (optional)
 285
 286 See L<Bio::LocatableSeq> for more information
 287
 288 =cut
 289
 290 sub addSeq {
 291     my $self = shift;
 292     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 293     $self->add_seq(@_);
 294 }
 295
 296 sub add_seq {
 297     my $self = shift;
 298     my @args = @_;
 299     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 300     my ($name,$id,$start,$end);
 301
 302     unless ($seq) {
 303         $self->throw("LocatableSeq argument required");
 304     }
 305     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 306         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 307     }
 308     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 309     $order--; # jay's patch (user-specified order is 1-origin)
 310
 311     if ($order < 0) {
 312         $self->throw("User-specified value for ORDER must be >= 1");
 313     }
 314
 315     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 316
 317     # build the symbol list for this sequence,
 318     # will prune out the gap and missing/match chars
 319     # when actually asked for the symbol list in the
 320     # symbol_chars
 321     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 322
 323     $name = $seq->get_nse;
 324
 325     if( $self->{'_seq'}->{$name} ) {
 326         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 327     }
 328     else {
 329         $self->debug( "Assigning $name to $order\n");
 330
 331     my $ordh = $self->{'_order'};
 332     if ($ordh->{$order}) {
 333         # make space to insert
 334         # $c->() returns (in reverse order) the first subsequence
 335         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 336         # (3,2,1), and $c->(2,4,5) returns (2).
 337         my $c;
 338         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 339         map {
 340      $ordh->{$_+1} = $ordh->{$_}
 341         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 342
 343     }
 344     $ordh->{$order} = $name;
 345
 346         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 347             $self->{'_start_end_lists'}->{$id} = [];
 348         }
 349         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 350     }
 351
 352     $self->{'_seq'}->{$name} = $seq;
 353
 354 }
 355
 356
 357 =head2 remove_seq
 358
 359  Title     : remove_seq
 360  Usage     : $aln->remove_seq($seq);
 361  Function  : Removes a single sequence from an alignment
 362  Returns   :
 363  Argument  : a Bio::LocatableSeq object
 364
 365 =cut
 366
 367 sub removeSeq {
 368     my $self = shift;
 369     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 370     $self->remove_seq(@_);
 371 }
 372
 373 sub remove_seq {
 374     my $self = shift;
 375     my $seq = shift;
 376     my ($name,$id,$start,$end);
 377
 378     $self->throw("Need Bio::Locatable seq argument ")
 379         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 380
 381     $id = $seq->id();
 382     $start = $seq->start();
 383     $end  = $seq->end();
 384     $name = sprintf("%s/%d-%d",$id,$start,$end);
 385
 386     if( !exists $self->{'_seq'}->{$name} ) {
 387         $self->throw("Sequence $name does not exist in the alignment to remove!");
 388     }
 389
 390     delete $self->{'_seq'}->{$name};
 391
 392     # we need to remove this seq from the start_end_lists hash
 393
 394     if (exists $self->{'_start_end_lists'}->{$id}) {
 395         # we need to find the sequence in the array.
 396
 397         my ($i, $found);;
 398         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 399             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 400                 $found = 1;
 401                 last;
 402             }
 403         }
 404         if ($found) {
 405             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 406         }
 407         else {
 408             $self->throw("Could not find the sequence to remoce from the start-end list");
 409         }
 410     }
 411     else {
 412         $self->throw("There is no seq list for the name $id");
 413     }
 414     # we need to shift order hash
 415     my %rev_order = reverse %{$self->{'_order'}};
 416     my $no = $rev_order{$name};
 417     my $num_sequences = $self->num_sequences;
 418     for (; $no < $num_sequences; $no++) {
 419        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 420     }
 421     delete $self->{'_order'}->{$no};
 422     return 1;
 423 }
 424
 425
 426 =head2 purge
 427
 428  Title   : purge
 429  Usage   : $aln->purge(0.7);
 430  Function: Removes sequences above given sequence similarity
 431            This function will grind on large alignments. Beware!
 432  Example :
 433  Returns : An array of the removed sequences
 434  Args    : float, threshold for similarity
 435
 436 =cut
 437
 438 sub purge {
 439         my ($self,$perc) = @_;
 440         my (%duplicate, @dups);
 441
 442         my @seqs = $self->each_seq();
 443
 444         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 445                 my $seq = $seqs[$i];
 446
 447                 #skip if already in duplicate hash
 448                 next if exists $duplicate{$seq->display_id} ;
 449                 my $one = $seq->seq();
 450
 451                 my @one = split '', $one;       #split to get 1aa per array element
 452
 453                 for (my $j=$i+1;$j < @seqs;$j++) {
 454                         my $seq2 = $seqs[$j];
 455
 456                         #skip if already in duplicate hash
 457                         next if exists $duplicate{$seq2->display_id} ;
 458
 459                         my $two = $seq2->seq();
 460                         my @two = split '', $two;
 461
 462                         my $count = 0;
 463                         my $res = 0;
 464                         for (my $k=0;$k<@one;$k++) {
 465                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 466                                           $one[$k] eq $two[$k]) {
 467                                         $count++;
 468                                 }
 469                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 470                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 471                                         $res++;
 472                                 }
 473                         }
 474
 475                         my $ratio = 0;
 476                         $ratio = $count/$res unless $res == 0;
 477
 478                         # if above threshold put in duplicate hash and push onto
 479                         # duplicate array for returning to get_unique
 480                         if ( $ratio > $perc ) {
 481                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 482                                 $duplicate{$seq2->display_id} = 1;
 483                                 push @dups, $seq2;
 484                         }
 485                 }
 486         }
 487         foreach my $seq (@dups) {
 488                 $self->remove_seq($seq);
 489         }
 490         return @dups;
 491 }
 492
 493 =head2 sort_alphabetically
 494
 495  Title     : sort_alphabetically
 496  Usage     : $ali->sort_alphabetically
 497  Function  : Changes the order of the alignment to alphabetical on name
 498              followed by numerical by number.
 499  Returns   :
 500  Argument  :
 501
 502 =cut
 503
 504 sub sort_alphabetically {
 505     my $self = shift;
 506     my ($seq,$nse,@arr,%hash,$count);
 507
 508     foreach $seq ( $self->each_seq() ) {
 509         $nse = $seq->get_nse;
 510         $hash{$nse} = $seq;
 511     }
 512
 513     $count = 0;
 514
 515     %{$self->{'_order'}} = (); # reset the hash;
 516
 517     foreach $nse ( sort _alpha_startend keys %hash) {
 518         $self->{'_order'}->{$count} = $nse;
 519
 520         $count++;
 521     }
 522     1;
 523 }
 524
 525 =head2 sort_by_list
 526
 527  Title     : sort_by_list
 528  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 529  Function  : Arbitrarily order sequences in an alignment
 530  Returns   : A new Bio::SimpleAlign object
 531  Argument  : a file listing sequence names in intended order (one name per line)
 532
 533 =cut
 534
 535 sub sort_by_list {
 536     my ($self, $list) = @_;
 537     my (@seq, @ids, %order);
 538
 539     foreach my $seq ( $self->each_seq() ) {
 540         push @seq, $seq;
 541         push @ids, $seq->display_id;
 542     }
 543
 544     my $ct=1;
 545     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 546     while (<$listfh>) {
 547       chomp;
 548       my $name=$_;
 549       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 550       $order{$name}=$ct++;
 551     }
 552     close($listfh);
 553
 554     # use the map-sort-map idiom:
 555     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 556     my $aln = $self->new;
 557     foreach (@sorted) { $aln->add_seq($_) }
 558     return $aln;
 559 }
 560
 561 =head2 set_new_reference
 562
 563  Title     : set_new_reference
 564  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 565              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 566              as the reference (1st) sequence
 567  Function  : Change/Set a new reference (i.e., the first) sequence
 568  Returns   : a new Bio::SimpleAlign object.
 569              Throws an exception if designated sequence not found
 570  Argument  : a positive integer of sequence order, or a sequence name
 571              in the original alignment
 572
 573 =cut
 574
 575 sub set_new_reference {
 576     my ($self, $seqid) = @_;
 577     my $aln = $self->new;
 578     my (@seq, @ids, @new_seq);
 579     my $is_num=0;
 580     foreach my $seq ( $self->each_seq() ) {
 581         push @seq, $seq;
 582         push @ids, $seq->display_id;
 583     }
 584
 585     if ($seqid =~ /^\d+$/) { # argument is seq position
 586         $is_num=1;
 587         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 588     } else { # argument is a seq name
 589         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 590     }
 591
 592     for (my $i=0; $i<=$#seq; $i++) {
 593         my $pos=$i+1;
 594         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 595             unshift @new_seq, $seq[$i];
 596         } else {
 597             push @new_seq, $seq[$i];
 598         }
 599     }
 600     foreach (@new_seq) { $aln->add_seq($_);  }
 601     return $aln;
 602 }
 603
 604 sub _in_aln {  # check if input name exists in the alignment
 605     my ($str, $ref) = @_;
 606     foreach (@$ref) {
 607         return 1 if $str eq $_;
 608     }
 609     return 0;
 610 }
 611
 612
 613 =head2 uniq_seq
 614
 615  Title     : uniq_seq
 616  Usage     : $aln->uniq_seq():  Remove identical sequences in
 617              in the alignment.  Ambiguous base ("N", "n") and
 618              leading and ending gaps ("-") are NOT counted as
 619              differences.
 620  Function  : Make a new alignment of unique sequence types (STs)
 621  Returns   : 1a. if called in a scalar context,
 622                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 623              1b. if called in an array context,
 624                 a new Bio::SimpleAlign object, and a hashref whose keys
 625                 are sequence types, and whose values are arrayrefs to
 626                 lists of sequence ids within the corresponding sequence type
 627              2. if $aln->verbose > 0, ST of each sequence is sent to
 628                 STDERR (in a tabular format)
 629  Argument  : None
 630
 631 =cut
 632
 633 sub uniq_seq {
 634     my ($self, $seqid) = @_;
 635     my $aln = $self->new;
 636     my (%member, %order, @seq, @uniq_str, $st);
 637     my $order=0;
 638     my $len = $self->length();
 639     $st = {};
 640     foreach my $seq ( $self->each_seq() ) {
 641         my $str = $seq->seq();
 642
 643 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 644 # comparing two sequence strings
 645
 646     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 647         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 648     # 2nd, convert leading and ending gaps to "?":
 649         $str = &_convert_leading_ending_gaps($str, '-', '?');
 650     # Note that '?' also can mean unknown residue.
 651     # I don't like making global class member changes like this, too
 652     # prone to errors... -- cjfields 08-11-18
 653     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 654         my $new = Bio::LocatableSeq->new(
 655                      -id      => $seq->id(),
 656                                          -alphabet=> $seq->alphabet,
 657                                          -seq     => $str,
 658                                          -start   => $seq->start,
 659                                          -end     => $seq->end
 660                                          );
 661         push @seq, $new;
 662     }
 663
 664     foreach my $seq (@seq) {
 665         my $str = $seq->seq();
 666         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 667         if ($seen) { # seen before
 668             my @memb = @{$member{$key}};
 669             push @memb, $seq;
 670             $member{$key} = \@memb;
 671         } else {  # not seen
 672             push @uniq_str, $key;
 673             $order++;
 674             $member{$key} = [ ($seq) ];
 675             $order{$key} = $order;
 676         }
 677     }
 678
 679     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 680 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 681         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 682 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 683         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 684         my $gap='-';
 685         my $end= CORE::length($str2);
 686         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 687         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 688                                          -seq  =>$str2,
 689                                          -start=>1,
 690                                          -end  =>$end
 691                                          );
 692         $aln->add_seq($new);
 693         foreach (@{$member{$str}}) {
 694             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 695         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 696         }
 697     }
 698     return wantarray ? ($aln, $st) : $aln;
 699 }
 700
 701 sub _check_uniq {  # check if same seq exists in the alignment
 702     my ($str1, $ref, $length) = @_;
 703     my @char1=split //, $str1;
 704     my @array=@$ref;
 705
 706     return (0, $str1) if @array==0; # not seen (1st sequence)
 707
 708     foreach my $str2 (@array) {
 709         my $diff=0;
 710         my @char2=split //, $str2;
 711         for (my $i=0; $i<=$length-1; $i++) {
 712             next if $char1[$i] eq '?';
 713             next if $char2[$i] eq '?';
 714             $diff++ if $char1[$i] ne $char2[$i];
 715         }
 716         return (1, $str2) if $diff == 0;  # seen before
 717     }
 718
 719     return (0, $str1); # not seen
 720 }
 721
 722 sub _convert_leading_ending_gaps {
 723     my $s=shift;
 724     my $sym1=shift;
 725     my $sym2=shift;
 726     my @array=split //, $s;
 727 # convert leading char:
 728     for (my $i=0; $i<=$#array; $i++) {
 729         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 730     }
 731 # convert ending char:
 732     for (my $i = $#array; $i>= 0; $i--) {
 733         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 734     }
 735     my $s_new=join '', @array;
 736     return $s_new;
 737 }
 738
 739 =head1 Sequence selection methods
 740
 741 Methods returning one or more sequences objects.
 742
 743 =head2 each_seq
 744
 745  Title     : each_seq
 746  Usage     : foreach $seq ( $align->each_seq() )
 747  Function  : Gets a Seq object from the alignment
 748  Returns   : Seq object
 749  Argument  :
 750
 751 =cut
 752
 753 sub eachSeq {
 754     my $self = shift;
 755     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 756     $self->each_seq();
 757 }
 758
 759 sub each_seq {
 760         my $self = shift;
 761         my (@arr,$order);
 762
 763         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 764                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 765                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 766                 }
 767         }
 768         return @arr;
 769 }
 770
 771
 772 =head2 each_alphabetically
 773
 774  Title     : each_alphabetically
 775  Usage     : foreach $seq ( $ali->each_alphabetically() )
 776  Function  : Returns a sequence object, but the objects are returned
 777              in alphabetically sorted order.
 778              Does not change the order of the alignment.
 779  Returns   : Seq object
 780  Argument  :
 781
 782 =cut
 783
 784 sub each_alphabetically {
 785         my $self = shift;
 786         my ($seq,$nse,@arr,%hash,$count);
 787
 788         foreach $seq ( $self->each_seq() ) {
 789                 $nse = $seq->get_nse;
 790                 $hash{$nse} = $seq;
 791         }
 792
 793         foreach $nse ( sort _alpha_startend keys %hash) {
 794                 push(@arr,$hash{$nse});
 795         }
 796         return @arr;
 797 }
 798
 799 sub _alpha_startend {
 800     my ($aname,$astart,$bname,$bstart);
 801     ($aname,$astart) = split (/-/,$a);
 802     ($bname,$bstart) = split (/-/,$b);
 803
 804     if( $aname eq $bname ) {
 805         return $astart <=> $bstart;
 806     }
 807     else {
 808         return $aname cmp $bname;
 809     }
 810 }
 811
 812 =head2 each_seq_with_id
 813
 814  Title     : each_seq_with_id
 815  Usage     : foreach $seq ( $align->each_seq_with_id() )
 816  Function  : Gets a Seq objects from the alignment, the contents
 817              being those sequences with the given name (there may be
 818              more than one)
 819  Returns   : Seq object
 820  Argument  : a seq name
 821
 822 =cut
 823
 824 sub eachSeqWithId {
 825     my $self = shift;
 826     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 827     $self->each_seq_with_id(@_);
 828 }
 829
 830 sub each_seq_with_id {
 831     my $self = shift;
 832     my $id = shift;
 833
 834     $self->throw("Method each_seq_with_id needs a sequence name argument")
 835         unless defined $id;
 836
 837     my (@arr, $seq);
 838
 839     if (exists($self->{'_start_end_lists'}->{$id})) {
 840         @arr = @{$self->{'_start_end_lists'}->{$id}};
 841     }
 842     return @arr;
 843 }
 844
 845 =head2 get_seq_by_pos
 846
 847  Title     : get_seq_by_pos
 848  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 849  Function  : Gets a sequence based on its position in the alignment.
 850              Numbering starts from 1.  Sequence positions larger than
 851              num_sequences() will thow an error.
 852  Returns   : a Bio::LocatableSeq object
 853  Args      : positive integer for the sequence position
 854
 855 =cut
 856
 857 sub get_seq_by_pos {
 858
 859     my $self = shift;
 860     my ($pos) = @_;
 861
 862     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 863         unless $pos =~ /^\d+$/ and $pos > 0;
 864     $self->throw("No sequence at position [$pos]")
 865         unless $pos <= $self->num_sequences ;
 866
 867     my $nse = $self->{'_order'}->{--$pos};
 868     return $self->{'_seq'}->{$nse};
 869 }
 870
 871 =head2 get_seq_by_id
 872
 873  Title     : get_seq_by_id
 874  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 875  Function  : Gets a sequence based on its name.
 876              Sequences that do not exist will warn and return undef
 877  Returns   : a Bio::LocatableSeq object
 878  Args      : string for sequence name
 879
 880 =cut
 881
 882 sub get_seq_by_id {
 883     my ($self,$name) = @_;
 884     unless( defined $name ) {
 885       $self->warn("Must provide a sequence name");
 886       return;
 887     }
 888     for my $seq ( values %{$self->{'_seq'}} ) {
 889       if ( $seq->id eq $name) {
 890         return $seq;
 891       }
 892     }
 893     return;
 894 }
 895
 896 =head2 seq_with_features
 897
 898  Title   : seq_with_features
 899  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 900                                           -consensus => 60
 901                                           -mask =>
 902            sub { my $consensus = shift;
 903
 904                  for my $i (1..5){
 905                     my $n = 'N' x $i;
 906                     my $q = '\?' x $i;
 907                     while($consensus =~ /[^?]$q[^?]/){
 908                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 909                     }
 910                   }
 911                  return $consensus;
 912                }
 913                                          );
 914  Function: produces a Bio::Seq object by first splicing gaps from -pos
 915            (by means of a splice_by_seq_pos() call), then creating
 916            features using non-? chars (by means of a consensus_string()
 917            call with stringency -consensus).
 918  Returns : a Bio::Seq object
 919  Args    : -pos : required. sequence from which to build the Bio::Seq
 920              object
 921            -consensus : optional, defaults to consensus_string()'s
 922              default cutoff value
 923            -mask : optional, a coderef to apply to consensus_string()'s
 924              output before building features.  this may be useful for
 925              closing gaps of 1 bp by masking over them with N, for
 926              instance
 927
 928 =cut
 929
 930 sub seq_with_features{
 931    my ($self,%arg) = @_;
 932
 933    #first do the preparatory splice
 934    $self->throw("must provide a -pos argument") unless $arg{-pos};
 935    $self->splice_by_seq_pos($arg{-pos});
 936
 937    my $consensus_string = $self->consensus_string($arg{-consensus});
 938    $consensus_string = $arg{-mask}->($consensus_string)
 939          if defined($arg{-mask});
 940
 941    my(@bs,@es);
 942
 943    push @bs, 1 if $consensus_string =~ /^[^?]/;
 944
 945    while($consensus_string =~ /\?[^?]/g){
 946          push @bs, pos($consensus_string);
 947    }
 948    while($consensus_string =~ /[^?]\?/g){
 949          push @es, pos($consensus_string);
 950    }
 951
 952    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 953
 954    my $seq = Bio::Seq->new();
 955
 956 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 957 #                -source_tag => 'location',
 958 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 959 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 960 #                                                  );
 961 #   $seq->add_SeqFeature($rootfeature);
 962
 963    while(my $b = shift @bs){
 964          my $e = shift @es;
 965          $seq->add_SeqFeature(
 966        Bio::SeqFeature::Generic->new(
 967          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 968          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 969          -source_tag => $self->source || 'MSA',
 970        )
 971      );
 972    }
 973
 974    return $seq;
 975 }
 976
 977
 978 =head1 Create new alignments
 979
 980 The result of these methods are horizontal or vertical subsets of the
 981 current MSA.
 982
 983 =head2 select
 984
 985  Title     : select
 986  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 987  Function  : Creates a new alignment from a continuous subset of
 988              sequences.  Numbering starts from 1.  Sequence positions
 989              larger than num_sequences() will thow an error.
 990  Returns   : a Bio::SimpleAlign object
 991  Args      : positive integer for the first sequence
 992              positive integer for the last sequence to include (optional)
 993
 994 =cut
 995
 996 sub select {
 997     my $self = shift;
 998     my ($start, $end) = @_;
 999
1000     $self->throw("Select start has to be a positive integer, not [$start]")
1001         unless $start =~ /^\d+$/ and $start > 0;
1002     $self->throw("Select end has to be a positive integer, not [$end]")
1003         unless $end  =~ /^\d+$/ and $end > 0;
1004     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1005         unless $start <= $end;
1006
1007     my $aln = $self->new;
1008     foreach my $pos ($start .. $end) {
1009         $aln->add_seq($self->get_seq_by_pos($pos));
1010     }
1011     $aln->id($self->id);
1012     # fix for meta, sf, ann
1013     return $aln;
1014 }
1015
1016 =head2 select_noncont
1017
1018  Title     : select_noncont
1019  Usage     : # 1st and 3rd sequences, sorted
1020              $aln2 = $aln->select_noncont(1, 3)
1021
1022              # 1st and 3rd sequences, sorted (same as first)
1023              $aln2 = $aln->select_noncont(3, 1)
1024
1025              # 1st and 3rd sequences, unsorted
1026              $aln2 = $aln->select_noncont('nosort',3, 1)
1027
1028  Function  : Creates a new alignment from a subset of sequences.  Numbering
1029              starts from 1.  Sequence positions larger than num_sequences() will
1030              throw an error.  Sorts the order added to new alignment by default,
1031              to prevent sorting pass 'nosort' as the first argument in the list.
1032  Returns   : a Bio::SimpleAlign object
1033  Args      : array of integers for the sequences.  If the string 'nosort' is
1034              passed as the first argument, the sequences will not be sorted
1035              in the new alignment but will appear in the order listed.
1036
1037 =cut
1038
1039 sub select_noncont {
1040         my $self = shift;
1041     my $nosort = 0;
1042         my (@pos) = @_;
1043     if ($pos[0] !~ m{^\d+$}) {
1044         my $sortcmd = shift @pos;
1045         if ($sortcmd eq 'nosort') {
1046             $nosort = 1;
1047         } else {
1048             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1049         }
1050     }
1051
1052     my $end = $self->num_sequences;
1053     foreach ( @pos ) {
1054                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1055                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1056         }
1057
1058         @pos = sort {$a <=> $b} @pos unless $nosort;
1059
1060         my $aln = $self->new;
1061         foreach my $p (@pos) {
1062                 $aln->add_seq($self->get_seq_by_pos($p));
1063         }
1064         $aln->id($self->id);
1065     # fix for meta, sf, ann
1066         return $aln;
1067 }
1068
1069 =head2 slice
1070
1071  Title     : slice
1072  Usage     : $aln2 = $aln->slice(20,30)
1073  Function  : Creates a slice from the alignment inclusive of start and
1074              end columns, and the first column in the alignment is denoted 1.
1075              Sequences with no residues in the slice are excluded from the
1076              new alignment and a warning is printed. Slice beyond the length of
1077              the sequence does not do padding.
1078  Returns   : A Bio::SimpleAlign object
1079  Args      : Positive integer for start column, positive integer for end column,
1080              optional boolean which if true will keep gap-only columns in the newly
1081              created slice. Example:
1082
1083              $aln2 = $aln->slice(20,30,1)
1084
1085 =cut
1086
1087 sub slice {
1088         my $self = shift;
1089         my ($start, $end, $keep_gap_only) = @_;
1090
1091         $self->throw("Slice start has to be a positive integer, not [$start]")
1092           unless $start =~ /^\d+$/ and $start > 0;
1093         $self->throw("Slice end has to be a positive integer, not [$end]")
1094           unless $end =~ /^\d+$/ and $end > 0;
1095         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1096           unless $start <= $end;
1097         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1098                                          "[$start] is too big.") if $start > $self->length;
1099     my $cons_meta = $self->consensus_meta;
1100         my $aln = $self->new;
1101         $aln->id($self->id);
1102         foreach my $seq ( $self->each_seq() ) {
1103             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1104             Bio::Seq::Meta->new
1105         (-id      => $seq->id,
1106                  -alphabet => $seq->alphabet,
1107                  -strand  => $seq->strand,
1108                  -verbose => $self->verbose) :
1109             Bio::LocatableSeq->new
1110         (-id      => $seq->id,
1111                  -alphabet => $seq->alphabet,
1112                  -strand  => $seq->strand,
1113                  -verbose => $self->verbose);
1114
1115             # seq
1116             my $seq_end = $end;
1117             $seq_end = $seq->length if( $end > $seq->length );
1118
1119             my $slice_seq = $seq->subseq($start, $seq_end);
1120             $new_seq->seq( $slice_seq );
1121
1122             $slice_seq =~ s/\W//g;
1123
1124             if ($start > 1) {
1125             my $pre_start_seq = $seq->subseq(1, $start - 1);
1126             $pre_start_seq =~ s/\W//g;
1127             if (!defined($seq->strand)) {
1128                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1129             } elsif ($seq->strand < 0){
1130                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1131             } else {
1132                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1133             }
1134             } else {
1135             if ((defined $seq->strand)&&($seq->strand < 0)){
1136                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1137             } else {
1138                $new_seq->start( $seq->start);
1139             }
1140             }
1141         if ($new_seq->isa('Bio::Seq::MetaI')) {
1142             for my $meta_name ($seq->meta_names) {
1143                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1144             }
1145         }
1146             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1147
1148             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1149             $aln->add_seq($new_seq);
1150             } else {
1151             if( $keep_gap_only ) {
1152                 $aln->add_seq($new_seq);
1153             } else {
1154                 my $nse = $seq->get_nse();
1155                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1156                     " Sequence excluded from the new alignment.");
1157             }
1158             }
1159         }
1160     if ($cons_meta) {
1161         my $new = Bio::Seq::Meta->new();
1162         for my $meta_name ($cons_meta->meta_names) {
1163             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1164         }
1165         $aln->consensus_meta($new);
1166     }
1167     $aln->annotation($self->annotation);
1168     # fix for meta, sf, ann
1169         return $aln;
1170 }
1171
1172 =head2 remove_columns
1173
1174  Title     : remove_columns
1175  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1176              $aln2 = $aln->remove_columns([0,0],[6,8])
1177  Function  : Creates an aligment with columns removed corresponding to
1178              the specified type or by specifying the columns by number.
1179  Returns   : Bio::SimpleAlign object
1180  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1181              'all_gaps_columns') or array ref where the referenced array
1182              contains a pair of integers that specify a range.
1183              The first column is 0
1184
1185 =cut
1186
1187 sub remove_columns {
1188     my ($self,@args) = @_;
1189     @args || $self->throw("Must supply column ranges or column types");
1190     my $aln;
1191
1192     if ($args[0][0] =~ /^[a-z_]+$/i) {
1193         $aln = $self->_remove_columns_by_type($args[0]);
1194     } elsif ($args[0][0] =~ /^\d+$/) {
1195         $aln = $self->_remove_columns_by_num(\@args);
1196     } else {
1197         $self->throw("You must pass array references to remove_columns(), not @args");
1198     }
1199     # fix for meta, sf, ann
1200     $aln;
1201 }
1202
1203
1204 =head2 remove_gaps
1205
1206  Title     : remove_gaps
1207  Usage     : $aln2 = $aln->remove_gaps
1208  Function  : Creates an aligment with gaps removed
1209  Returns   : a Bio::SimpleAlign object
1210  Args      : a gap character(optional) if none specified taken
1211                 from $self->gap_char,
1212              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1213                         indicates that only all-gaps columns should be deleted
1214
1215 Used from method L<remove_columns> in most cases. Set gap character
1216 using L<gap_char()|gap_char>.
1217
1218 =cut
1219
1220 sub remove_gaps {
1221     my ($self,$gapchar,$all_gaps_columns) = @_;
1222     my $gap_line;
1223     if ($all_gaps_columns) {
1224         $gap_line = $self->all_gap_line($gapchar);
1225     } else {
1226         $gap_line = $self->gap_line($gapchar);
1227     }
1228     my $aln = $self->new;
1229
1230     my @remove;
1231     my $length = 0;
1232     my $del_char = $gapchar || $self->gap_char;
1233     # Do the matching to get the segments to remove
1234     while ($gap_line =~ m/[$del_char]/g) {
1235         my $start = pos($gap_line)-1;
1236         $gap_line=~/\G[$del_char]+/gc;
1237         my $end = pos($gap_line)-1;
1238
1239         #have to offset the start and end for subsequent removes
1240         $start-=$length;
1241         $end  -=$length;
1242         $length += ($end-$start+1);
1243         push @remove, [$start,$end];
1244     }
1245
1246     #remove the segments
1247     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1248     # fix for meta, sf, ann
1249     return $aln;
1250 }
1251
1252
1253 sub _remove_col {
1254     my ($self,$aln,$remove) = @_;
1255     my @new;
1256
1257     my $gap = $self->gap_char;
1258
1259     # splice out the segments and create new seq
1260     foreach my $seq($self->each_seq){
1261         my $new_seq = Bio::LocatableSeq->new(
1262                                              -id      => $seq->id,
1263                                              -alphabet=> $seq->alphabet,
1264                                              -strand  => $seq->strand,
1265                                              -verbose => $self->verbose);
1266         my $sequence = $seq->seq;
1267         foreach my $pair(@{$remove}){
1268             my $start = $pair->[0];
1269             my $end   = $pair->[1];
1270             $sequence = $seq->seq unless $sequence;
1271             my $orig = $sequence;
1272             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1273             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1274             $sequence = $head.$tail;
1275             # start
1276             unless (defined $new_seq->start) {
1277                 if ($start == 0) {
1278                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1279                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1280                 }
1281                 else {
1282                     my $start_adjust = $orig =~ /^$gap+/;
1283                     if ($start_adjust) {
1284                         $start_adjust = $+[0] == $start;
1285                     }
1286                     $new_seq->start($seq->start + $start_adjust);
1287                 }
1288             }
1289             # end
1290             if (($end + 1) >= CORE::length($orig)) {
1291                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1292                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1293             }
1294             else {
1295                 $new_seq->end($seq->end);
1296             }
1297         }
1298
1299         if ($new_seq->end < $new_seq->start) {
1300             # we removed all columns except for gaps: set to 0 to indicate no
1301             # sequence
1302             $new_seq->start(0);
1303             $new_seq->end(0);
1304         }
1305
1306         $new_seq->seq($sequence) if $sequence;
1307                 push @new, $new_seq;
1308     }
1309     # add the new seqs to the alignment
1310     foreach my $new(@new){
1311         $aln->add_seq($new);
1312     }
1313     # fix for meta, sf, ann
1314     return $aln;
1315 }
1316
1317 sub _remove_columns_by_type {
1318         my ($self,$type) = @_;
1319         my $aln = $self->new;
1320         my @remove;
1321
1322         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1323         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1324         my %matchchars = ( 'match'           => '\*',
1325                        'weak'             => '\.',
1326                        'strong'           => ':',
1327                        'mismatch'         => ' ',
1328                        'gaps'             => '',
1329                        'all_gaps_columns' => ''
1330                      );
1331         # get the characters to delete against
1332         my $del_char;
1333         foreach my $type (@{$type}){
1334                 $del_char.= $matchchars{$type};
1335         }
1336
1337         my $length = 0;
1338         my $match_line = $self->match_line;
1339         # do the matching to get the segments to remove
1340         if($del_char){
1341                 while($match_line =~ m/[$del_char]/g ){
1342                         my $start = pos($match_line)-1;
1343                         $match_line=~/\G[$del_char]+/gc;
1344                         my $end = pos($match_line)-1;
1345
1346                         #have to offset the start and end for subsequent removes
1347                         $start-=$length;
1348                         $end  -=$length;
1349                         $length += ($end-$start+1);
1350                         push @remove, [$start,$end];
1351                 }
1352         }
1353
1354         # remove the segments
1355         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1356         $aln = $aln->remove_gaps() if $gap;
1357         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1358     # fix for meta, sf, ann
1359         $aln;
1360 }
1361
1362
1363 sub _remove_columns_by_num {
1364         my ($self,$positions) = @_;
1365         my $aln = $self->new;
1366
1367         # sort the positions
1368         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1369
1370     my @remove;
1371     my $length = 0;
1372     foreach my $pos (@{$positions}) {
1373         my ($start, $end) = @{$pos};
1374
1375         #have to offset the start and end for subsequent removes
1376         $start-=$length;
1377         $end  -=$length;
1378         $length += ($end-$start+1);
1379         push @remove, [$start,$end];
1380     }
1381
1382     #remove the segments
1383     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1384     # fix for meta, sf, ann
1385         $aln;
1386 }
1387
1388
1389 =head1 Change sequences within the MSA
1390
1391 These methods affect characters in all sequences without changing the
1392 alignment.
1393
1394 =head2 splice_by_seq_pos
1395
1396  Title   : splice_by_seq_pos
1397  Usage   : $status = splice_by_seq_pos(1);
1398  Function: splices all aligned sequences where the specified sequence
1399            has gaps.
1400  Example :
1401  Returns : 1 on success
1402  Args    : position of sequence to splice by
1403
1404
1405 =cut
1406
1407 sub splice_by_seq_pos{
1408   my ($self,$pos) = @_;
1409
1410   my $guide = $self->get_seq_by_pos($pos);
1411   my $guide_seq = $guide->seq;
1412
1413   $guide_seq =~ s/\./\-/g;
1414
1415   my @gaps = ();
1416   $pos = -1;
1417   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1418     unshift @gaps, $pos;
1419     $pos++;
1420   }
1421
1422   foreach my $seq ($self->each_seq){
1423     my @bases = split '', $seq->seq;
1424
1425     splice(@bases, $_, 1) foreach @gaps;
1426     $seq->seq(join('', @bases));
1427   }
1428
1429   1;
1430 }
1431
1432 =head2 map_chars
1433
1434  Title     : map_chars
1435  Usage     : $ali->map_chars('\.','-')
1436  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1437              characters
1438
1439              Notice that the from (arg1) is interpretted as a regex,
1440              so be careful about quoting meta characters (eg
1441              $ali->map_chars('.','-') wont do what you want)
1442  Returns   :
1443  Argument  : 'from' rexexp
1444              'to' string
1445
1446 =cut
1447
1448 sub map_chars {
1449     my $self = shift;
1450     my $from = shift;
1451     my $to   = shift;
1452     my ($seq,$temp);
1453
1454     $self->throw("Need exactly two arguments")
1455         unless defined $from and defined $to;
1456
1457     foreach $seq ( $self->each_seq() ) {
1458         $temp = $seq->seq();
1459         $temp =~ s/$from/$to/g;
1460         $seq->seq($temp);
1461     }
1462     return 1;
1463 }
1464
1465
1466 =head2 uppercase
1467
1468  Title     : uppercase()
1469  Usage     : $ali->uppercase()
1470  Function  : Sets all the sequences to uppercase
1471  Returns   :
1472  Argument  :
1473
1474 =cut
1475
1476 sub uppercase {
1477     my $self = shift;
1478     my $seq;
1479     my $temp;
1480
1481     foreach $seq ( $self->each_seq() ) {
1482       $temp = $seq->seq();
1483       $temp =~ tr/[a-z]/[A-Z]/;
1484
1485       $seq->seq($temp);
1486     }
1487     return 1;
1488 }
1489
1490 =head2 cigar_line
1491
1492  Title    : cigar_line()
1493  Usage    : %cigars = $align->cigar_line()
1494  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495             Report) line for each sequence in the alignment. Examples are
1496             "1,60" or "5,10:12,58", where the numbers refer to conserved
1497             positions within the alignment. The keys of the hash are the
1498             NSEs (name/start/end) assigned to each sequence.
1499  Args     : threshold (optional, defaults to 100)
1500  Returns  : Hash of strings (cigar lines)
1501
1502 =cut
1503
1504 sub cigar_line {
1505         my $self = shift;
1506         my $thr=shift||100;
1507         my %cigars;
1508
1509         my @consensus = split "",($self->consensus_string($thr));
1510         my $len = $self->length;
1511         my $gapchar = $self->gap_char;
1512
1513         # create a precursor, something like (1,4,5,6,7,33,45),
1514         # where each number corresponds to a conserved position
1515         foreach my $seq ( $self->each_seq ) {
1516                 my @seq = split "", uc ($seq->seq);
1517                 my $pos = 1;
1518                 for (my $x = 0 ; $x < $len ; $x++ ) {
1519                         if ($seq[$x] eq $consensus[$x]) {
1520                                 push @{$cigars{$seq->get_nse}},$pos;
1521                                 $pos++;
1522                         } elsif ($seq[$x] ne $gapchar) {
1523                                 $pos++;
1524                         }
1525                 }
1526         }
1527         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528         for my $name (keys %cigars) {
1529                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1530                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1531       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1532            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1533                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1534                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1535                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1536                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1537                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1538                         }
1539       }
1540         }
1541   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542   for my $name (keys %cigars) {
1543           my @remove;
1544           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1545                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1546                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1547                       unshift @remove,$x;
1548               }
1549            }
1550       for my $pos (@remove) {
1551                         splice @{$cigars{$name}}, $pos, 1;
1552            }
1553    }
1554    # join and punctuate
1555    for my $name (keys %cigars) {
1556           my ($start,$end,$str) = "";
1557           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1558                   $str .= ($start . "," . $end . ":");
1559           }
1560           $str =~ s/:$//;
1561       $cigars{$name} = $str;
1562    }
1563    %cigars;
1564 }
1565
1566
1567 =head2 match_line
1568
1569  Title    : match_line()
1570  Usage    : $line = $align->match_line()
1571  Function : Generates a match line - much like consensus string
1572             except that a line indicating the '*' for a match.
1573  Args     : (optional) Match line characters ('*' by default)
1574             (optional) Strong match char (':' by default)
1575             (optional) Weak match char ('.' by default)
1576  Returns  : String
1577
1578 =cut
1579
1580 sub match_line {
1581         my ($self,$matchlinechar, $strong, $weak) = @_;
1582         my %matchchars = ('match'    => $matchlinechar || '*',
1583                                                           'weak'     => $weak          || '.',
1584                                                           'strong'   => $strong        || ':',
1585                                                           'mismatch' => ' ',
1586                                                   );
1587
1588         my @seqchars;
1589         my $alphabet;
1590         foreach my $seq ( $self->each_seq ) {
1591                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592                 $alphabet = $seq->alphabet unless defined $alphabet;
1593         }
1594         my $refseq = shift @seqchars;
1595         # let's just march down the columns
1596         my $matchline;
1597  POS:
1598         foreach my $pos ( 0..$self->length ) {
1599                 my $refchar = $refseq->[$pos];
1600                 my $char = $matchchars{'mismatch'};
1601                 unless( defined $refchar ) {
1602                         last if $pos == $self->length; # short circuit on last residue
1603                         # this in place to handle jason's soon-to-be-committed
1604                         # intron mapping code
1605                         goto bottom;
1606                 }
1607                 my %col = ($refchar => 1);
1608                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609                 foreach my $seq ( @seqchars ) {
1610                         next if $pos >= scalar @$seq;
1611                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1612                                                           $seq->[$pos] eq ' ' );
1613                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1614                 }
1615                 my @colresidues = sort keys %col;
1616
1617                 # if all the values are the same
1618                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1619                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621                         # matches for protein seqs
1622             TYPE:
1623                         foreach my $type ( qw(strong weak) ) {
1624                                 # iterate through categories
1625                                 my %groups;
1626                                 # iterate through each of the aa in the col
1627                                 # look to see which groups it is in
1628                                 foreach my $c ( @colresidues ) {
1629                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1630                                                 push @{$groups{$f}},$c;
1631                                         }
1632                                 }
1633                          GRP:
1634                                 foreach my $cols ( values %groups ) {
1635                                         @$cols = sort @$cols;
1636                                         # now we are just testing to see if two arrays
1637                                         # are identical w/o changing either one
1638                                         # have to be same len
1639                                         next if( scalar @$cols != scalar @colresidues );
1640                                         # walk down the length and check each slot
1641                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1642                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1643                                         }
1644                                         $char = $matchchars{$type};
1645                                         last TYPE;
1646                                 }
1647                         }
1648                 }
1649          bottom:
1650                 $matchline .= $char;
1651         }
1652         return $matchline;
1653 }
1654
1655
1656 =head2 gap_line
1657
1658  Title    : gap_line()
1659  Usage    : $line = $align->gap_line()
1660  Function : Generates a gap line - much like consensus string
1661             except that a line where '-' represents gap
1662  Args     : (optional) gap line characters ('-' by default)
1663  Returns  : string
1664
1665 =cut
1666
1667 sub gap_line {
1668     my ($self,$gapchar) = @_;
1669     $gapchar = $gapchar || $self->gap_char;
1670     my %gap_hsh; # column gaps vector
1671     foreach my $seq ( $self->each_seq ) {
1672                 my $i = 0;
1673         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1674                   map {[$i++, $_]} split(//, uc ($seq->seq));
1675     }
1676     my $gap_line;
1677     foreach my $pos ( 0..$self->length-1 ) {
1678           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1679     }
1680     return $gap_line;
1681 }
1682
1683 =head2 all_gap_line
1684
1685  Title    : all_gap_line()
1686  Usage    : $line = $align->all_gap_line()
1687  Function : Generates a gap line - much like consensus string
1688             except that a line where '-' represents all-gap column
1689  Args     : (optional) gap line characters ('-' by default)
1690  Returns  : string
1691
1692 =cut
1693
1694 sub all_gap_line {
1695     my ($self,$gapchar) = @_;
1696     $gapchar = $gapchar || $self->gap_char;
1697     my %gap_hsh;                # column gaps counter hash
1698     my @seqs = $self->each_seq;
1699     foreach my $seq ( @seqs ) {
1700         my $i = 0;
1701         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1702         map {[$i++, $_]} split(//, uc ($seq->seq));
1703     }
1704     my $gap_line;
1705     foreach my $pos ( 0..$self->length-1 ) {
1706         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707             # gaps column
1708             $gap_line .= $gapchar;
1709         } else {
1710             $gap_line .= '.';
1711         }
1712     }
1713     return $gap_line;
1714 }
1715
1716 =head2 gap_col_matrix
1717
1718  Title    : gap_col_matrix()
1719  Usage    : my $cols = $align->gap_col_matrix()
1720  Function : Generates an array of hashes where
1721             each entry in the array is a hash reference
1722             with keys of all the sequence names and
1723             and value of 1 or 0 if the sequence has a gap at that column
1724  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1725
1726 =cut
1727
1728 sub gap_col_matrix {
1729     my ($self,$gapchar) = @_;
1730     $gapchar = $gapchar || $self->gap_char;
1731     my %gap_hsh; # column gaps vector
1732     my @cols;
1733     foreach my $seq ( $self->each_seq ) {
1734         my $i = 0;
1735         my $str = $seq->seq;
1736         my $len = $seq->length;
1737         my $ch;
1738         my $id = $seq->display_id;
1739         while( $i < $len ) {
1740             $ch = substr($str, $i, 1);
1741             $cols[$i++]->{$id} = ($ch eq $gapchar);
1742         }
1743     }
1744     return \@cols;
1745 }
1746
1747 =head2 match
1748
1749  Title     : match()
1750  Usage     : $ali->match()
1751  Function  : Goes through all columns and changes residues that are
1752              identical to residue in first sequence to match '.'
1753              character. Sets match_char.
1754
1755              USE WITH CARE: Most MSA formats do not support match
1756              characters in sequences, so this is mostly for output
1757              only. NEXUS format (Bio::AlignIO::nexus) can handle
1758              it.
1759  Returns   : 1
1760  Argument  : a match character, optional, defaults to '.'
1761
1762 =cut
1763
1764 sub match {
1765     my ($self, $match) = @_;
1766
1767     $match ||= '.';
1768     my ($matching_char) = $match;
1769     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1770     $self->map_chars($matching_char, '-');
1771
1772     my @seqs = $self->each_seq();
1773     return 1 unless scalar @seqs > 1;
1774
1775     my $refseq = shift @seqs ;
1776     my @refseq = split //, $refseq->seq;
1777     my $gapchar = $self->gap_char;
1778
1779     foreach my $seq ( @seqs ) {
1780         my @varseq = split //, $seq->seq();
1781         for ( my $i=0; $i < scalar @varseq; $i++) {
1782             $varseq[$i] = $match if defined $refseq[$i] &&
1783                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1784                   $refseq[$i] =~ /$gapchar/ )
1785                       && $refseq[$i] eq $varseq[$i];
1786         }
1787         $seq->seq(join '', @varseq);
1788     }
1789     $self->match_char($match);
1790     return 1;
1791 }
1792
1793
1794 =head2 unmatch
1795
1796  Title     : unmatch()
1797  Usage     : $ali->unmatch()
1798  Function  : Undoes the effect of method match. Unsets match_char.
1799  Returns   : 1
1800  Argument  : a match character, optional, defaults to '.'
1801
1802 See L<match> and L<match_char>
1803
1804 =cut
1805
1806 sub unmatch {
1807     my ($self, $match) = @_;
1808
1809     $match ||= '.';
1810
1811     my @seqs = $self->each_seq();
1812     return 1 unless scalar @seqs > 1;
1813
1814     my $refseq = shift @seqs ;
1815     my @refseq = split //, $refseq->seq;
1816     my $gapchar = $self->gap_char;
1817     foreach my $seq ( @seqs ) {
1818         my @varseq = split //, $seq->seq();
1819         for ( my $i=0; $i < scalar @varseq; $i++) {
1820             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1821                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1822                   $refseq[$i] =~ /$gapchar/ ) &&
1823                       $varseq[$i] eq $match;
1824         }
1825         $seq->seq(join '', @varseq);
1826     }
1827     $self->match_char('');
1828     return 1;
1829 }
1830
1831 =head1 MSA attributes
1832
1833 Methods for setting and reading the MSA attributes.
1834
1835 Note that the methods defining character semantics depend on the user
1836 to set them sensibly.  They are needed only by certain input/output
1837 methods. Unset them by setting to an empty string ('').
1838
1839 =head2 id
1840
1841  Title     : id
1842  Usage     : $myalign->id("Ig")
1843  Function  : Gets/sets the id field of the alignment
1844  Returns   : An id string
1845  Argument  : An id string (optional)
1846
1847 =cut
1848
1849 sub id {
1850     my ($self, $name) = @_;
1851
1852     if (defined( $name )) {
1853         $self->{'_id'} = $name;
1854     }
1855
1856     return $self->{'_id'};
1857 }
1858
1859 =head2 accession
1860
1861  Title     : accession
1862  Usage     : $myalign->accession("PF00244")
1863  Function  : Gets/sets the accession field of the alignment
1864  Returns   : An acc string
1865  Argument  : An acc string (optional)
1866
1867 =cut
1868
1869 sub accession {
1870     my ($self, $acc) = @_;
1871
1872     if (defined( $acc )) {
1873         $self->{'_accession'} = $acc;
1874     }
1875
1876     return $self->{'_accession'};
1877 }
1878
1879 =head2 description
1880
1881  Title     : description
1882  Usage     : $myalign->description("14-3-3 proteins")
1883  Function  : Gets/sets the description field of the alignment
1884  Returns   : An description string
1885  Argument  : An description string (optional)
1886
1887 =cut
1888
1889 sub description {
1890     my ($self, $name) = @_;
1891
1892     if (defined( $name )) {
1893         $self->{'_description'} = $name;
1894     }
1895
1896     return $self->{'_description'};
1897 }
1898
1899 =head2 missing_char
1900
1901  Title     : missing_char
1902  Usage     : $myalign->missing_char("?")
1903  Function  : Gets/sets the missing_char attribute of the alignment
1904              It is generally recommended to set it to 'n' or 'N'
1905              for nucleotides and to 'X' for protein.
1906  Returns   : An missing_char string,
1907  Argument  : An missing_char string (optional)
1908
1909 =cut
1910
1911 sub missing_char {
1912     my ($self, $char) = @_;
1913
1914     if (defined $char ) {
1915         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1916         $self->{'_missing_char'} = $char;
1917     }
1918
1919     return $self->{'_missing_char'};
1920 }
1921
1922 =head2 match_char
1923
1924  Title     : match_char
1925  Usage     : $myalign->match_char('.')
1926  Function  : Gets/sets the match_char attribute of the alignment
1927  Returns   : An match_char string,
1928  Argument  : An match_char string (optional)
1929
1930 =cut
1931
1932 sub match_char {
1933     my ($self, $char) = @_;
1934
1935     if (defined $char ) {
1936         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1937         $self->{'_match_char'} = $char;
1938     }
1939
1940     return $self->{'_match_char'};
1941 }
1942
1943 =head2 gap_char
1944
1945  Title     : gap_char
1946  Usage     : $myalign->gap_char('-')
1947  Function  : Gets/sets the gap_char attribute of the alignment
1948  Returns   : An gap_char string, defaults to '-'
1949  Argument  : An gap_char string (optional)
1950
1951 =cut
1952
1953 sub gap_char {
1954     my ($self, $char) = @_;
1955
1956     if (defined $char || ! defined $self->{'_gap_char'} ) {
1957         $char= '-' unless defined $char;
1958         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1959         $self->{'_gap_char'} = $char;
1960     }
1961     return $self->{'_gap_char'};
1962 }
1963
1964 =head2 symbol_chars
1965
1966  Title   : symbol_chars
1967  Usage   : my @symbolchars = $aln->symbol_chars;
1968  Function: Returns all the seen symbols (other than gaps)
1969  Returns : array of characters that are the seen symbols
1970  Args    : boolean to include the gap/missing/match characters
1971
1972 =cut
1973
1974 sub symbol_chars{
1975    my ($self,$includeextra) = @_;
1976
1977    unless ($self->{'_symbols'}) {
1978        foreach my $seq ($self->each_seq) {
1979            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1980        }
1981    }
1982    my %copy = %{$self->{'_symbols'}};
1983    if( ! $includeextra ) {
1984        foreach my $char ( $self->gap_char, $self->match_char,
1985                           $self->missing_char) {
1986            delete $copy{$char} if( defined $char );
1987        }
1988    }
1989    return keys %copy;
1990 }
1991
1992 =head1 Alignment descriptors
1993
1994 These read only methods describe the MSA in various ways.
1995
1996
1997 =head2 score
1998
1999  Title     : score
2000  Usage     : $str = $ali->score()
2001  Function  : get/set a score of the alignment
2002  Returns   : a score for the alignment
2003  Argument  : an optional score to set
2004
2005 =cut
2006
2007 sub score {
2008   my $self = shift;
2009   $self->{score} = shift if @_;
2010   return $self->{score};
2011 }
2012
2013 =head2 consensus_string
2014
2015  Title     : consensus_string
2016  Usage     : $str = $ali->consensus_string($threshold_percent)
2017  Function  : Makes a strict consensus
2018  Returns   : Consensus string
2019  Argument  : Optional treshold ranging from 0 to 100.
2020              The consensus residue has to appear at least threshold %
2021              of the sequences at a given location, otherwise a '?'
2022              character will be placed at that location.
2023              (Default value = 0%)
2024
2025 =cut
2026
2027 sub consensus_string {
2028     my $self = shift;
2029     my $threshold = shift;
2030
2031     my $out = "";
2032     my $len = $self->length - 1;
2033
2034     foreach ( 0 .. $len ) {
2035         $out .= $self->_consensus_aa($_,$threshold);
2036     }
2037     return $out;
2038 }
2039
2040 sub _consensus_aa {
2041     my $self = shift;
2042     my $point = shift;
2043     my $threshold_percent = shift || -1 ;
2044     my ($seq,%hash,$count,$letter,$key);
2045     my $gapchar = $self->gap_char;
2046     foreach $seq ( $self->each_seq() ) {
2047         $letter = substr($seq->seq,$point,1);
2048         $self->throw("--$point-----------") if $letter eq '';
2049         ($letter eq $gapchar || $letter =~ /\./) && next;
2050         # print "Looking at $letter\n";
2051         $hash{$letter}++;
2052     }
2053     my $number_of_sequences = $self->num_sequences();
2054     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2055     $count = -1;
2056     $letter = '?';
2057
2058     foreach $key ( sort keys %hash ) {
2059         # print "Now at $key $hash{$key}\n";
2060         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2061             $letter = $key;
2062             $count = $hash{$key};
2063         }
2064     }
2065     return $letter;
2066 }
2067
2068
2069 =head2 consensus_iupac
2070
2071  Title     : consensus_iupac
2072  Usage     : $str = $ali->consensus_iupac()
2073  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2074              and RNA. The output is in upper case except when gaps in
2075              a column force output to be in lower case.
2076
2077              Note that if your alignment sequences contain a lot of
2078              IUPAC ambiquity codes you often have to manually set
2079              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2080              indicate a protein sequence.
2081  Returns   : consensus string
2082  Argument  : none
2083  Throws    : on protein sequences
2084
2085 =cut
2086
2087 sub consensus_iupac {
2088     my $self = shift;
2089     my $out = "";
2090     my $len = $self->length-1;
2091
2092     # only DNA and RNA sequences are valid
2093     foreach my $seq ( $self->each_seq() ) {
2094         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2095             if $seq->alphabet eq 'protein';
2096     }
2097     # loop over the alignment columns
2098     foreach my $count ( 0 .. $len ) {
2099         $out .= $self->_consensus_iupac($count);
2100     }
2101     return $out;
2102 }
2103
2104 sub _consensus_iupac {
2105     my ($self, $column) = @_;
2106     my ($string, $char, $rna);
2107
2108     #determine all residues in a column
2109     foreach my $seq ( $self->each_seq() ) {
2110         $string .= substr($seq->seq, $column, 1);
2111     }
2112     $string = uc $string;
2113
2114     # quick exit if there's an N in the string
2115     if ($string =~ /N/) {
2116         $string =~ /\W/ ? return 'n' : return 'N';
2117     }
2118     # ... or if there are only gap characters
2119     return '-' if $string =~ /^\W+$/;
2120
2121     # treat RNA as DNA in regexps
2122     if ($string =~ /U/) {
2123         $string =~ s/U/T/;
2124         $rna = 1;
2125     }
2126
2127     # the following s///'s only need to be done to the _first_ ambiguity code
2128     # as we only need to see the _range_ of characters in $string
2129
2130     if ($string =~ /[VDHB]/) {
2131         $string =~ s/V/AGC/;
2132         $string =~ s/D/AGT/;
2133         $string =~ s/H/ACT/;
2134         $string =~ s/B/CTG/;
2135     }
2136
2137     if ($string =~ /[SKYRWM]/) {
2138         $string =~ s/S/GC/;
2139         $string =~ s/K/GT/;
2140         $string =~ s/Y/CT/;
2141         $string =~ s/R/AG/;
2142         $string =~ s/W/AT/;
2143         $string =~ s/M/AC/;
2144     }
2145
2146     # and now the guts of the thing
2147
2148     if ($string =~ /A/) {
2149         $char = 'A';                     # A                      A
2150         if ($string =~ /G/) {
2151             $char = 'R';                 # A and G (purines)      R
2152             if ($string =~ /C/) {
2153                 $char = 'V';             # A and G and C          V
2154                 if ($string =~ /T/) {
2155                     $char = 'N';         # A and G and C and T    N
2156                 }
2157             } elsif ($string =~ /T/) {
2158                 $char = 'D';             # A and G and T          D
2159             }
2160         } elsif ($string =~ /C/) {
2161             $char = 'M';                 # A and C                M
2162             if ($string =~ /T/) {
2163                 $char = 'H';             # A and C and T          H
2164             }
2165         } elsif ($string =~ /T/) {
2166             $char = 'W';                 # A and T                W
2167         }
2168     } elsif ($string =~ /C/) {
2169         $char = 'C';                     # C                      C
2170         if ($string =~ /T/) {
2171             $char = 'Y';                 # C and T (pyrimidines)  Y
2172             if ($string =~ /G/) {
2173                 $char = 'B';             # C and T and G          B
2174             }
2175         } elsif ($string =~ /G/) {
2176             $char = 'S';                 # C and G                S
2177         }
2178     } elsif ($string =~ /G/) {
2179         $char = 'G';                     # G                      G
2180         if ($string =~ /C/) {
2181             $char = 'S';                 # G and C                S
2182         } elsif ($string =~ /T/) {
2183             $char = 'K';                 # G and T                K
2184         }
2185     } elsif ($string =~ /T/) {
2186         $char = 'T';                     # T                      T
2187     }
2188
2189     $char = 'U' if $rna and $char eq 'T';
2190     $char = lc $char if $string =~ /\W/;
2191
2192     return $char;
2193 }
2194
2195
2196 =head2 consensus_meta
2197
2198  Title     : consensus_meta
2199  Usage     : $seqmeta = $ali->consensus_meta()
2200  Function  : Returns a Bio::Seq::Meta object containing the consensus
2201              strings derived from meta data analysis.
2202  Returns   : Bio::Seq::Meta
2203  Argument  : Bio::Seq::Meta
2204  Throws    : non-MetaI object
2205
2206 =cut
2207
2208 sub consensus_meta {
2209     my ($self, $meta) = @_;
2210     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2211         $self->throw('Not a Bio::Seq::MetaI object');
2212     }
2213     return $self->{'_aln_meta'} = $meta if $meta;
2214     return $self->{'_aln_meta'}
2215 }
2216
2217 =head2 is_flush
2218
2219  Title     : is_flush
2220  Usage     : if ( $ali->is_flush() )
2221  Function  : Tells you whether the alignment
2222            : is flush, i.e. all of the same length
2223  Returns   : 1 or 0
2224  Argument  :
2225
2226 =cut
2227
2228 sub is_flush {
2229     my ($self,$report) = @_;
2230     my $seq;
2231     my $length = (-1);
2232     my $temp;
2233
2234     foreach $seq ( $self->each_seq() ) {
2235         if( $length == (-1) ) {
2236             $length = CORE::length($seq->seq());
2237             next;
2238         }
2239
2240         $temp = CORE::length($seq->seq());
2241         if( $temp != $length ) {
2242             $self->warn("expecting $length not $temp from ".
2243                         $seq->display_id) if( $report );
2244             $self->debug("expecting $length not $temp from ".
2245                          $seq->display_id);
2246             $self->debug($seq->seq(). "\n");
2247             return 0;
2248         }
2249     }
2250
2251     return 1;
2252 }
2253
2254
2255 =head2 length
2256
2257  Title     : length()
2258  Usage     : $len = $ali->length()
2259  Function  : Returns the maximum length of the alignment.
2260              To be sure the alignment is a block, use is_flush
2261  Returns   : Integer
2262  Argument  :
2263
2264 =cut
2265
2266 sub length_aln {
2267     my $self = shift;
2268     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2269     $self->length(@_);
2270 }
2271
2272 sub length {
2273     my $self = shift;
2274     my $seq;
2275     my $length = -1;
2276     my $temp;
2277
2278     foreach $seq ( $self->each_seq() ) {
2279         $temp = $seq->length();
2280         if( $temp > $length ) {
2281             $length = $temp;
2282         }
2283     }
2284
2285     return $length;
2286 }
2287
2288
2289 =head2 maxdisplayname_length
2290
2291  Title     : maxdisplayname_length
2292  Usage     : $ali->maxdisplayname_length()
2293  Function  : Gets the maximum length of the displayname in the
2294              alignment. Used in writing out various MSA formats.
2295  Returns   : integer
2296  Argument  :
2297
2298 =cut
2299
2300 sub maxname_length {
2301     my $self = shift;
2302     $self->deprecated("maxname_length - deprecated method.".
2303                       " Use maxdisplayname_length() instead.");
2304     $self->maxdisplayname_length();
2305 }
2306
2307 sub maxnse_length {
2308     my $self = shift;
2309     $self->deprecated("maxnse_length - deprecated method.".
2310                       " Use maxnse_length() instead.");
2311     $self->maxdisplayname_length();
2312 }
2313
2314 sub maxdisplayname_length {
2315     my $self = shift;
2316     my $maxname = (-1);
2317     my ($seq,$len);
2318
2319     foreach $seq ( $self->each_seq() ) {
2320         $len = CORE::length $self->displayname($seq->get_nse());
2321
2322         if( $len > $maxname ) {
2323             $maxname = $len;
2324         }
2325     }
2326
2327     return $maxname;
2328 }
2329
2330 =head2 max_metaname_length
2331
2332  Title     : max_metaname_length
2333  Usage     : $ali->max_metaname_length()
2334  Function  : Gets the maximum length of the meta name tags in the
2335              alignment for the sequences and for the alignment.
2336              Used in writing out various MSA formats.
2337  Returns   : integer
2338  Argument  : None
2339
2340 =cut
2341
2342 sub max_metaname_length {
2343     my $self = shift;
2344     my $maxname = (-1);
2345     my ($seq,$len);
2346
2347     # check seq meta first
2348     for $seq ( $self->each_seq() ) {
2349         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2350         for my $mtag ($seq->meta_names) {
2351             $len = CORE::length $mtag;
2352             if( $len > $maxname ) {
2353                 $maxname = $len;
2354             }
2355         }
2356     }
2357
2358     # alignment meta
2359     for my $meta ($self->consensus_meta) {
2360         next unless $meta;
2361         for my $name ($meta->meta_names) {
2362             $len = CORE::length $name;
2363             if( $len > $maxname ) {
2364                 $maxname = $len;
2365             }
2366         }
2367     }
2368
2369     return $maxname;
2370 }
2371
2372 =head2 num_residues
2373
2374  Title     : num_residues
2375  Usage     : $no = $ali->num_residues
2376  Function  : number of residues in total in the alignment
2377  Returns   : integer
2378  Argument  :
2379  Note      : replaces no_residues()
2380
2381 =cut
2382
2383 sub num_residues {
2384     my $self = shift;
2385     my $count = 0;
2386
2387     foreach my $seq ($self->each_seq) {
2388         my $str = $seq->seq();
2389
2390         $count += ($str =~ s/[A-Za-z]//g);
2391     }
2392
2393     return $count;
2394 }
2395
2396 =head2 num_sequences
2397
2398  Title     : num_sequences
2399  Usage     : $depth = $ali->num_sequences
2400  Function  : number of sequence in the sequence alignment
2401  Returns   : integer
2402  Argument  : none
2403  Note      : replaces no_sequences()
2404
2405 =cut
2406
2407 sub num_sequences {
2408     my $self = shift;
2409     return scalar($self->each_seq);
2410 }
2411
2412
2413 =head2 average_percentage_identity
2414
2415  Title   : average_percentage_identity
2416  Usage   : $id = $align->average_percentage_identity
2417  Function: The function uses a fast method to calculate the average
2418            percentage identity of the alignment
2419  Returns : The average percentage identity of the alignment
2420  Args    : None
2421  Notes   : This method implemented by Kevin Howe calculates a figure that is
2422            designed to be similar to the average pairwise identity of the
2423            alignment (identical in the absence of gaps), without having to
2424            explicitly calculate pairwise identities proposed by Richard Durbin.
2425            Validated by Ewan Birney ad Alex Bateman.
2426
2427 =cut
2428
2429 sub average_percentage_identity{
2430    my ($self,@args) = @_;
2431
2432    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2433                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2434
2435    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2436
2437    if (! $self->is_flush()) {
2438        $self->throw("All sequences in the alignment must be the same length");
2439    }
2440
2441    @seqs = $self->each_seq();
2442    $len = $self->length();
2443
2444    # load the each hash with correct keys for existence checks
2445
2446    for( my $index=0; $index < $len; $index++) {
2447        foreach my $letter (@alphabet) {
2448            $countHashes[$index]->{$letter} = 0;
2449        }
2450    }
2451    foreach my $seq (@seqs)  {
2452        my @seqChars = split //, $seq->seq();
2453        for( my $column=0; $column < @seqChars; $column++ ) {
2454            my $char = uc($seqChars[$column]);
2455            if (exists $countHashes[$column]->{$char}) {
2456                $countHashes[$column]->{$char}++;
2457            }
2458        }
2459    }
2460
2461    $total = 0;
2462    $divisor = 0;
2463    for(my $column =0; $column < $len; $column++) {
2464        my %hash = %{$countHashes[$column]};
2465        $subdivisor = 0;
2466        foreach my $res (keys %hash) {
2467            $total += $hash{$res}*($hash{$res} - 1);
2468            $subdivisor += $hash{$res};
2469        }
2470        $divisor += $subdivisor * ($subdivisor - 1);
2471    }
2472    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2473 }
2474
2475 =head2 percentage_identity
2476
2477  Title   : percentage_identity
2478  Usage   : $id = $align->percentage_identity
2479  Function: The function calculates the average percentage identity
2480            (aliased to average_percentage_identity)
2481  Returns : The average percentage identity
2482  Args    : None
2483
2484 =cut
2485
2486 sub percentage_identity {
2487     my $self = shift;
2488     return $self->average_percentage_identity();
2489 }
2490
2491 =head2 overall_percentage_identity
2492
2493  Title   : overall_percentage_identity
2494  Usage   : $id = $align->overall_percentage_identity
2495            $id = $align->overall_percentage_identity('short')
2496  Function: The function calculates the percentage identity of
2497            the conserved columns
2498  Returns : The percentage identity of the conserved columns
2499  Args    : length value to use, optional defaults to alignment length
2500                  possible values: 'align', 'short', 'long'
2501
2502 The argument values 'short' and 'long' refer to shortest and longest
2503 sequence in the alignment. Method modification code by Hongyu Zhang.
2504
2505 =cut
2506
2507 sub overall_percentage_identity{
2508    my ($self, $length_measure) = @_;
2509
2510    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2511
2512    my %enum = map {$_ => undef} qw (align short long);
2513
2514    $self->throw("Unknown argument [$length_measure]")
2515        if $length_measure and not exists $enum{$length_measure};
2516    $length_measure ||= 'align';
2517
2518    if (! $self->is_flush()) {
2519        $self->throw("All sequences in the alignment must be the same length");
2520    }
2521
2522    # Count the residues seen at each position
2523    my $len;
2524    my $total = 0; # number of positions with identical residues
2525    my @countHashes;
2526    my @seqs = $self->each_seq;
2527    my $nof_seqs = scalar @seqs;
2528    my $aln_len = $self->length();
2529    for my $seq (@seqs)  {
2530        my $seqstr = $seq->seq;
2531
2532        # Count residues for given sequence
2533        for my $column (0 .. $aln_len-1) {
2534            my $char = uc( substr($seqstr, $column, 1) );
2535            if ( exists $alphabet{$char} ) {
2536
2537                # This is a valid char
2538                if ( defined $countHashes[$column]->{$char} ) {
2539                  $countHashes[$column]->{$char}++;
2540                } else {
2541                  $countHashes[$column]->{$char} = 1;
2542                }
2543
2544                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2545                    # All sequences have this same residue
2546                    $total++;
2547                }
2548
2549            }
2550        }
2551
2552        # Sequence length
2553        if ($length_measure eq 'short' || $length_measure eq 'long') {
2554            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2555            if ($length_measure eq 'short') {
2556                if ( (not defined $len) || ($seq_len < $len) ) {
2557                    $len = $seq_len;
2558                }
2559            } elsif ($length_measure eq 'long') {
2560                if ( (not defined $len) || ($seq_len > $len) ) {
2561                    $len = $seq_len;
2562                }
2563            }
2564        }
2565
2566    }
2567
2568    if ($length_measure eq 'align') {
2569        $len = $aln_len;
2570    }
2571
2572    return ($total / $len ) * 100.0;
2573 }
2574
2575
2576
2577 =head1 Alignment positions
2578
2579 Methods to map a sequence position into an alignment column and back.
2580 column_from_residue_number() does the former. The latter is really a
2581 property of the sequence object and can done using
2582 L<Bio::LocatableSeq::location_from_column>:
2583
2584     # select somehow a sequence from the alignment, e.g.
2585     my $seq = $aln->get_seq_by_pos(1);
2586     #$loc is undef or Bio::LocationI object
2587     my $loc = $seq->location_from_column(5);
2588
2589 =head2 column_from_residue_number
2590
2591  Title   : column_from_residue_number
2592  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2593  Function: This function gives the position in the alignment
2594            (i.e. column number) of the given residue number in the
2595            sequence with the given name. For example, for the
2596            alignment
2597
2598              Seq1/91-97 AC..DEF.GH.
2599              Seq2/24-30 ACGG.RTY...
2600                 Seq3/43-51 AC.DDEF.GHI
2601
2602            column_from_residue_number( "Seq1", 94 ) returns 6.
2603            column_from_residue_number( "Seq2", 25 ) returns 2.
2604            column_from_residue_number( "Seq3", 50 ) returns 10.
2605
2606            An exception is thrown if the residue number would lie
2607            outside the length of the aligment
2608            (e.g. column_from_residue_number( "Seq2", 22 )
2609
2610           Note: If the the parent sequence is represented by more than
2611                 one alignment sequence and the residue number is present in
2612                 them, this method finds only the first one.
2613
2614  Returns : A column number for the position in the alignment of the
2615            given residue in the given sequence (1 = first column)
2616  Args    : A sequence id/name (not a name/start-end)
2617            A residue number in the whole sequence (not just that
2618            segment of it in the alignment)
2619
2620 =cut
2621
2622 sub column_from_residue_number {
2623     my ($self, $name, $resnumber) = @_;
2624
2625     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2626     $self->throw("Second argument residue number missing") unless $resnumber;
2627
2628     foreach my $seq ($self->each_seq_with_id($name)) {
2629         my $col;
2630         eval {
2631             $col = $seq->column_from_residue_number($resnumber);
2632         };
2633         next if $@;
2634         return $col;
2635     }
2636
2637     $self->throw("Could not find a sequence segment in $name ".
2638                  "containing residue number $resnumber");
2639
2640 }
2641
2642 =head1 Sequence names
2643
2644 Methods to manipulate the display name. The default name based on the
2645 sequence id and subsequence positions can be overridden in various
2646 ways.
2647
2648 =head2 displayname
2649
2650  Title     : displayname
2651  Usage     : $myalign->displayname("Ig", "IgA")
2652  Function  : Gets/sets the display name of a sequence in the alignment
2653  Returns   : A display name string
2654  Argument  : name of the sequence
2655              displayname of the sequence (optional)
2656
2657 =cut
2658
2659 sub get_displayname {
2660     my $self = shift;
2661     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2662     $self->displayname(@_);
2663 }
2664
2665 sub set_displayname {
2666     my $self = shift;
2667     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2668     $self->displayname(@_);
2669 }
2670
2671 sub displayname {
2672     my ($self, $name, $disname) = @_;
2673
2674     $self->throw("No sequence with name [$name]")
2675         unless defined $self->{'_seq'}->{$name};
2676
2677     if(  $disname and  $name) {
2678         $self->{'_dis_name'}->{$name} = $disname;
2679         return $disname;
2680     }
2681     elsif( defined $self->{'_dis_name'}->{$name} ) {
2682         return  $self->{'_dis_name'}->{$name};
2683     } else {
2684         return $name;
2685     }
2686 }
2687
2688 =head2 set_displayname_count
2689
2690  Title     : set_displayname_count
2691  Usage     : $ali->set_displayname_count
2692  Function  : Sets the names to be name_# where # is the number of
2693              times this name has been used.
2694  Returns   : 1, on success
2695  Argument  :
2696
2697 =cut
2698
2699 sub set_displayname_count {
2700     my $self= shift;
2701     my (@arr,$name,$seq,$count,$temp,$nse);
2702
2703     foreach $seq ( $self->each_alphabetically() ) {
2704         $nse = $seq->get_nse();
2705
2706         #name will be set when this is the second
2707         #time (or greater) is has been seen
2708
2709         if( defined $name and $name eq ($seq->id()) ) {
2710             $temp = sprintf("%s_%s",$name,$count);
2711             $self->displayname($nse,$temp);
2712             $count++;
2713         } else {
2714             $count = 1;
2715             $name = $seq->id();
2716             $temp = sprintf("%s_%s",$name,$count);
2717             $self->displayname($nse,$temp);
2718             $count++;
2719         }
2720     }
2721     return 1;
2722 }
2723
2724 =head2 set_displayname_flat
2725
2726  Title     : set_displayname_flat
2727  Usage     : $ali->set_displayname_flat()
2728  Function  : Makes all the sequences be displayed as just their name,
2729              not name/start-end
2730  Returns   : 1
2731  Argument  :
2732
2733 =cut
2734
2735 sub set_displayname_flat {
2736     my $self = shift;
2737     my ($nse,$seq);
2738
2739     foreach $seq ( $self->each_seq() ) {
2740         $nse = $seq->get_nse();
2741         $self->displayname($nse,$seq->id());
2742     }
2743     return 1;
2744 }
2745
2746 =head2 set_displayname_normal
2747
2748  Title     : set_displayname_normal
2749  Usage     : $ali->set_displayname_normal()
2750  Function  : Makes all the sequences be displayed as name/start-end
2751  Returns   : 1, on success
2752  Argument  :
2753
2754 =cut
2755
2756 sub set_displayname_normal {
2757     my $self = shift;
2758     my ($nse,$seq);
2759
2760     foreach $seq ( $self->each_seq() ) {
2761         $nse = $seq->get_nse();
2762         $self->displayname($nse,$nse);
2763     }
2764     return 1;
2765 }
2766
2767 =head2 source
2768
2769  Title   : source
2770  Usage   : $obj->source($newval)
2771  Function: sets the Alignment source program
2772  Example :
2773  Returns : value of source
2774  Args    : newvalue (optional)
2775
2776
2777 =cut
2778
2779 sub source{
2780    my ($self,$value) = @_;
2781    if( defined $value) {
2782       $self->{'_source'} = $value;
2783     }
2784     return $self->{'_source'};
2785 }
2786
2787 =head2 set_displayname_safe
2788
2789  Title     : set_displayname_safe
2790  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2791  Function  : Assign machine-generated serial names to sequences in input order.
2792              Designed to protect names during PHYLIP runs. Assign 10-char string
2793              in the form of "S000000001" to "S999999999". Restore the original
2794              names using "restore_displayname".
2795  Returns   : 1. a new $aln with system names;
2796              2. a hash ref for restoring names
2797  Argument  : Number for id length (default 10)
2798
2799 =cut
2800
2801 sub set_displayname_safe {
2802     my $self = shift;
2803     my $idlength = shift || 10;
2804     my ($seq, %phylip_name);
2805     my $ct=0;
2806     my $new=Bio::SimpleAlign->new();
2807     foreach $seq ( $self->each_seq() ) {
2808         $ct++;
2809         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2810         $phylip_name{$pname}=$seq->id();
2811         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2812                                             -seq      => $seq->seq(),
2813                                             -alphabet => $seq->alphabet,
2814                                             -start    => $seq->{_start},
2815                                             -end      => $seq->{_end}
2816                                             );
2817         $new->add_seq($new_seq);
2818     }
2819
2820     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2821     return ($new, \%phylip_name);
2822 }
2823
2824 =head2 restore_displayname
2825
2826  Title     : restore_displayname
2827  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2828  Function  : Restore original sequence names (after running
2829              $ali->set_displayname_safe)
2830  Returns   : a new $aln with names restored.
2831  Argument  : a hash reference of names from "set_displayname_safe".
2832
2833 =cut
2834
2835 sub restore_displayname {
2836     my $self = shift;
2837     my $ref=shift;
2838     my %name=%$ref;
2839     my $new=Bio::SimpleAlign->new();
2840     foreach my $seq ( $self->each_seq() ) {
2841       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2842       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2843                                           -seq      => $seq->seq(),
2844                                           -alphabet => $seq->alphabet,
2845                                           -start    => $seq->{_start},
2846                                           -end      => $seq->{_end}
2847                                           );
2848       $new->add_seq($new_seq);
2849     }
2850     return $new;
2851 }
2852
2853 =head2 sort_by_start
2854
2855  Title     : sort_by_start
2856  Usage     : $ali->sort_by_start
2857  Function  : Changes the order of the alignment to the start position of each
2858              subalignment
2859  Returns   :
2860  Argument  :
2861
2862 =cut
2863
2864 sub sort_by_start {
2865     my $self = shift;
2866     my ($seq,$nse,@arr,%hash,$count);
2867     foreach $seq ( $self->each_seq() ) {
2868         $nse = $seq->get_nse;
2869         $hash{$nse} = $seq;
2870     }
2871     $count = 0;
2872     %{$self->{'_order'}} = (); # reset the hash;
2873     foreach $nse ( sort _startend keys %hash) {
2874         $self->{'_order'}->{$count} = $nse;
2875         $count++;
2876     }
2877     1;
2878 }
2879
2880 sub _startend
2881 {
2882     my ($aname,$arange) = split (/[\/]/,$a);
2883     my ($bname,$brange) = split (/[\/]/,$b);
2884     my ($astart,$aend) = split(/\-/,$arange);
2885     my ($bstart,$bend) = split(/\-/,$brange);
2886     return $astart <=> $bstart;
2887 }
2888
2889 =head2 bracket_string
2890
2891  Title     : bracket_string
2892  Usage     : my @params = (-refseq     => 'testseq',
2893                            -allele1    => 'allele1',
2894                            -allele2    => 'allele2',
2895                            -delimiters => '{}',
2896                            -separator  => '/');
2897              $str = $aln->bracket_string(@params)
2898
2899  Function :  When supplied with a list of parameters (see below), returns a
2900              string in BIC format. This is used for allelic comparisons.
2901              Briefly, if either allele contains a base change when compared to
2902              the refseq, the base or gap for each allele is represented in
2903              brackets in the order present in the 'alleles' parameter.
2904
2905              For the following data:
2906
2907              >testseq
2908              GGATCCATTGCTACT
2909              >allele1
2910              GGATCCATTCCTACT
2911              >allele2
2912              GGAT--ATTCCTCCT
2913
2914              the returned string with parameters 'refseq => testseq' and
2915              'alleles => [qw(allele1 allele2)]' would be:
2916
2917              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2918  Returns   : BIC-formatted string
2919  Argument  : Required args
2920                 refseq    : string (ID) of the reference sequence used
2921                             as basis for comparison
2922                 allele1   : string (ID) of the first allele
2923                 allele2   : string (ID) of the second allele
2924              Optional args
2925                 delimiters: two symbol string of left and right delimiters.
2926                             Only the first two symbols are used
2927                             default = '[]'
2928                 separator : string used as a separator.  Only the first
2929                             symbol is used
2930                             default = '/'
2931  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2932
2933 =cut
2934
2935 sub bracket_string {
2936     my ($self, @args) = @_;
2937     my ($ref, $a1, $a2, $delim, $sep) =
2938         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2939     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2940     my ($ld, $rd);
2941     ($ld, $rd) = split('', $delim, 2) if $delim;
2942     $ld ||= '[';
2943     $rd ||= ']';
2944     $sep ||= '/';
2945     my ($refseq, $allele1, $allele2) =
2946         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2947     if (!$refseq || !$allele1 || !$allele2) {
2948         $self->throw("One of your refseq/allele IDs is invalid!");
2949     }
2950     my $len = $self->length-1;
2951     my $bic = '';
2952     # loop over the alignment columns
2953     for my $column ( 0 .. $len ) {
2954         my $string;
2955         my ($compres, $res1, $res2) =
2956             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2957         # are any of the allele symbols different from the refseq?
2958         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2959                 $ld.$res1.$sep.$res2.$rd;
2960         $bic .= $string;
2961     }
2962     return $bic;
2963 }
2964
2965
2966 =head2 methods implementing Bio::FeatureHolderI
2967
2968 FeatureHolderI implementation to support labeled character sets like one
2969 would get from NEXUS represented data.
2970
2971 =head2 get_SeqFeatures
2972
2973  Usage   : @features = $aln->get_SeqFeatures
2974  Function: Get the feature objects held by this feature holder.
2975  Example :
2976  Returns : an array of Bio::SeqFeatureI implementing objects
2977  Args    : optional filter coderef, taking a Bio::SeqFeatureI
2978          : as argument, returning TRUE if wanted, FALSE if
2979          : unwanted
2980
2981 =cut
2982
2983 sub get_SeqFeatures {
2984     my $self = shift;
2985     my $filter_cb = shift;
2986     $self->throw("Arg (filter callback) must be a coderef") unless
2987         !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2988     if( !defined $self->{'_as_feat'} ) {
2989         $self->{'_as_feat'} = [];
2990     }
2991     if ($filter_cb) {
2992         return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2993     }
2994     return @{$self->{'_as_feat'}};
2995 }
2996
2997 =head2 add_SeqFeature
2998
2999  Usage   : $aln->add_SeqFeature($subfeat);
3000  Function: adds a SeqFeature into the SeqFeature array.
3001  Example :
3002  Returns : true on success
3003  Args    : a Bio::SeqFeatureI object
3004  Note    : This implementation is not compliant
3005            with Bio::FeatureHolderI
3006
3007 =cut
3008
3009 sub add_SeqFeature {
3010    my ($self,@feat) = @_;
3011
3012    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3013
3014    foreach my $feat ( @feat ) {
3015        if( !$feat->isa("Bio::SeqFeatureI") ) {
3016            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3017        }
3018
3019        push(@{$self->{'_as_feat'}},$feat);
3020    }
3021    return 1;
3022 }
3023
3024
3025 =head2 remove_SeqFeatures
3026
3027  Usage   : $obj->remove_SeqFeatures
3028  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3029            remove that subset from the returned array, and add back the rest.
3030  Returns : The array of Bio::SeqFeatureI features that was
3031            deleted from this alignment.
3032  Args    : none
3033
3034 =cut
3035
3036 sub remove_SeqFeatures {
3037     my $self = shift;
3038
3039     return () unless $self->{'_as_feat'};
3040     my @feats = @{$self->{'_as_feat'}};
3041     $self->{'_as_feat'} = [];
3042     return @feats;
3043 }
3044
3045 =head2 feature_count
3046
3047  Title   : feature_count
3048  Usage   : $obj->feature_count()
3049  Function: Return the number of SeqFeatures attached to the alignment
3050  Returns : integer representing the number of SeqFeatures
3051  Args    : None
3052
3053 =cut
3054
3055 sub feature_count {
3056     my ($self) = @_;
3057
3058     if (defined($self->{'_as_feat'})) {
3059         return ($#{$self->{'_as_feat'}} + 1);
3060     } else {
3061         return 0;
3062     }
3063 }
3064
3065 =head2 get_all_SeqFeatures
3066
3067  Title   : get_all_SeqFeatures
3068  Usage   :
3069  Function: Get all SeqFeatures.
3070  Example :
3071  Returns : an array of Bio::SeqFeatureI implementing objects
3072  Args    : none
3073  Note    : Falls through to Bio::FeatureHolderI implementation.
3074
3075 =cut
3076
3077 =head2 methods for Bio::AnnotatableI
3078
3079 AnnotatableI implementation to support sequence alignments which
3080 contain annotation (NEXUS, Stockholm).
3081
3082 =head2 annotation
3083
3084  Title   : annotation
3085  Usage   : $ann = $aln->annotation or
3086            $aln->annotation($ann)
3087  Function: Gets or sets the annotation
3088  Returns : Bio::AnnotationCollectionI object
3089  Args    : None or Bio::AnnotationCollectionI object
3090
3091 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3092 for more information
3093
3094 =cut
3095
3096 sub annotation {
3097     my ($obj,$value) = @_;
3098     if( defined $value ) {
3099         $obj->throw("object of class ".ref($value)." does not implement ".
3100                 "Bio::AnnotationCollectionI. Too bad.")
3101             unless $value->isa("Bio::AnnotationCollectionI");
3102         $obj->{'_annotation'} = $value;
3103     } elsif( ! defined $obj->{'_annotation'}) {
3104         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3105     }
3106     return $obj->{'_annotation'};
3107 }
3108
3109 =head1 Deprecated methods
3110
3111 =cut
3112
3113 =head2 no_residues
3114
3115  Title     : no_residues
3116  Usage     : $no = $ali->no_residues
3117  Function  : number of residues in total in the alignment
3118  Returns   : integer
3119  Argument  :
3120  Note      : deprecated in favor of num_residues()
3121
3122 =cut
3123
3124 sub no_residues {
3125         my $self = shift;
3126         $self->deprecated(-warn_version => 1.0069,
3127                                           -throw_version => 1.0075,
3128                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3129     $self->num_residues(@_);
3130 }
3131
3132 =head2 no_sequences
3133
3134  Title     : no_sequences
3135  Usage     : $depth = $ali->no_sequences
3136  Function  : number of sequence in the sequence alignment
3137  Returns   : integer
3138  Argument  :
3139  Note      : deprecated in favor of num_sequences()
3140
3141 =cut
3142
3143 sub no_sequences {
3144         my $self = shift;
3145         $self->deprecated(-warn_version => 1.0069,
3146                                           -throw_version => 1.0075,
3147                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3148     $self->num_sequences(@_);
3149 }
3150
3151 =head2 mask_columns
3152
3153  Title     : mask_columns
3154  Usage     : $aln2 = $aln->mask_columns(20,30)
3155  Function  : Masks a slice of the alignment inclusive of start and
3156              end columns, and the first column in the alignment is denoted 1.
3157              Mask beyond the length of the sequence does not do padding.
3158  Returns   : A Bio::SimpleAlign object
3159  Args      : Positive integer for start column, positive integer for end column,
3160              optional string value use for the mask. Example:
3161
3162              $aln2 = $aln->mask_columns(20,30,'?')
3163  Note      : Masking must use a character that is not used for gaps or
3164              frameshifts.  These can be adjusted using the relevant global
3165              variables, but be aware these may be (uncontrollably) modified
3166              elsewhere within BioPerl (see bug 2715)
3167
3168 =cut
3169
3170 sub mask_columns {
3171     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3172     my $self = shift;
3173
3174     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3175              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3176
3177     # coordinates are alignment-based, not sequence-based
3178     my ($start, $end, $mask_char) = @_;
3179     unless (defined $mask_char) { $mask_char = 'N' }
3180
3181     $self->throw("Mask start has to be a positive integer and less than ".
3182                  "alignment length, not [$start]")
3183       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3184     $self->throw("Mask end has to be a positive integer and less than ".
3185                  "alignment length, not [$end]")
3186       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3187     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3188                  "end [$end]") unless $start <= $end;
3189     $self->throw("Mask character $mask_char has to be a single character ".
3190                  "and not a gap or frameshift symbol")
3191       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3192
3193     my $aln = $self->new;
3194     $aln->id($self->id);
3195     foreach my $seq ( $self->each_seq() ) {
3196         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3197          -alphabet => $seq->alphabet,
3198          -strand  => $seq->strand,
3199          -verbose => $self->verbose);
3200
3201         # convert from 1-based alignment coords!
3202         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3203         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3204         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3205         $new_seq->seq($new_dna_string);
3206         $aln->add_seq($new_seq);
3207     }
3208     return $aln;
3209 }
3210
3211
3212
3213 1;