Bio/SimpleAlign.pm

   1 # $Id$
   2 # BioPerl module for SimpleAlign
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13 #
  14 #  History:
  15 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  16 #       May 2001 major rewrite - Heikki Lehvaslaiho
  17
  18 =head1 NAME
  19
  20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  21
  22 =head1 SYNOPSIS
  23
  24   # Use Bio::AlignIO to read in the alignment
  25   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  26   $aln = $str->next_aln();
  27
  28   # Describe
  29   print $aln->length;
  30   print $aln->num_residues;
  31   print $aln->is_flush;
  32   print $aln->num_sequences;
  33   print $aln->score;
  34   print $aln->percentage_identity;
  35   print $aln->consensus_string(50);
  36
  37   # Find the position in the alignment for a sequence location
  38   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  39
  40   # Extract sequences and check values for the alignment column $pos
  41   foreach $seq ($aln->each_seq) {
  42       $res = $seq->subseq($pos, $pos);
  43       $count{$res}++;
  44   }
  45   foreach $res (keys %count) {
  46       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  47   }
  48
  49   # Manipulate
  50   $aln->remove_seq($seq);
  51   $mini_aln = $aln->slice(20,30);  # get a block of columns
  52   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  53   $new_aln = $aln->remove_columns([20,30]); # remove by position
  54   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  55
  56   # Analyze
  57   $str = $aln->consensus_string($threshold_percent);
  58   $str = $aln->match_line();
  59   $str = $aln->cigar_line();
  60   $id = $aln->percentage_identity;
  61
  62   # See the module documentation for details and more methods.
  63
  64 =head1 DESCRIPTION
  65
  66 SimpleAlign is an object that handles a multiple sequence alignment
  67 (MSA). It is very permissive of types (it does not insist on sequences
  68 being all same length, for example). Think of it as a set of sequences
  69 with a whole series of built-in manipulations and methods for reading and
  70 writing alignments.
  71
  72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  73 to store its sequences. These are subsequences with a start and end
  74 positions in the parent reference sequence. Each sequence in the
  75 SimpleAlign object is a Bio::LocatableSeq.
  76
  77 SimpleAlign expects the combination of name, start, and end for a
  78 given sequence to be unique in the alignment, and this is the key for the
  79 internal hashes (name, start, end are abbreviated C<nse> in the code).
  80 However, in some cases people do not want the name/start-end to be displayed:
  81 either multiple names in an alignment or names specific to the alignment
  82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  83 C<displayname>, and generally is what is used to print out the
  84 alignment. They default to name/start-end.
  85
  86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  87
  88 =head1 FEEDBACK
  89
  90 =head2 Mailing Lists
  91
  92 User feedback is an integral part of the evolution of this and other
  93 Bioperl modules.  Send your comments and suggestions preferably to one
  94 of the Bioperl mailing lists.  Your participation is much appreciated.
  95
  96   bioperl-l@bioperl.org                  - General discussion
  97   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  98
  99 =head2 Support
 100
 101 Please direct usage questions or support issues to the mailing list:
 102
 103 L<bioperl-l@bioperl.org>
 104
 105 rather than to the module maintainer directly. Many experienced and
 106 reponsive experts will be able look at the problem and quickly
 107 address it. Please include a thorough description of the problem
 108 with code and data examples if at all possible.
 109
 110 =head2 Reporting Bugs
 111
 112 Report bugs to the Bioperl bug tracking system to help us keep track
 113 the bugs and their resolution. Bug reports can be submitted via the
 114 web:
 115
 116   http://bugzilla.open-bio.org/
 117
 118 =head1 AUTHOR
 119
 120 Ewan Birney, birney@ebi.ac.uk
 121
 122 =head1 CONTRIBUTORS
 123
 124 Allen Day, allenday-at-ucla.edu,
 125 Richard Adams, Richard.Adams-at-ed.ac.uk,
 126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 128 Allen Smith, allens-at-cpan.org,
 129 Jason Stajich, jason-at-bioperl.org,
 130 Anthony Underwood, aunderwood-at-phls.org.uk,
 131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 132 Brian Osborne, bosborne at alum.mit.edu
 133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 134 Hongyu Zhang, forward at hongyu.org
 135 Jay Hannah, jay at jays.net
 136 Alexandr Bezginov, albezg at gmail.com
 137
 138 =head1 SEE ALSO
 139
 140 L<Bio::LocatableSeq>
 141
 142 =head1 APPENDIX
 143
 144 The rest of the documentation details each of the object
 145 methods. Internal methods are usually preceded with a _
 146
 147 =cut
 148
 149 # 'Let the code begin...
 150
 151 package Bio::SimpleAlign;
 152 use vars qw(%CONSERVATION_GROUPS);
 153 use strict;
 154
 155 use Bio::LocatableSeq;  # uses Seq's as list
 156
 157 use Bio::Seq;
 158 use Bio::SeqFeature::Generic;
 159
 160 BEGIN {
 161     # This data should probably be in a more centralized module...
 162     # it is taken from Clustalw documentation.
 163     # These are all the positively scoring groups that occur in the
 164     # Gonnet Pam250 matrix. The strong and weak groups are
 165     # defined as strong score >0.5 and weak score =<0.5 respectively.
 166
 167     %CONSERVATION_GROUPS = (
 168             'strong' => [ qw(
 169                                                  STA
 170                                                  NEQK
 171                                                  NHQK
 172                                                  NDEQ
 173                                                  QHRK
 174                                                  MILV
 175                                                  MILF
 176                                                  HY
 177                                                  FYW )],
 178                                 'weak' => [ qw(
 179                       CSA
 180                                                ATV
 181                                                SAG
 182                                                STNK
 183                                                STPA
 184                                                SGND
 185                                                SNDEQK
 186                                                NDEQHK
 187                                                NEQHRK
 188                                                FVLIM
 189                                                HFY )],);
 190 }
 191
 192 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 193             Bio::FeatureHolderI);
 194
 195 =head2 new
 196
 197  Title     : new
 198  Usage     : my $aln = Bio::SimpleAlign->new();
 199  Function  : Creates a new simple align object
 200  Returns   : Bio::SimpleAlign
 201  Args      : -source     => string representing the source program
 202                             where this alignment came from
 203              -annotation => Bio::AnnotationCollectionI
 204              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 205              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 206              -consensus  => consensus string
 207              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 208
 209 =cut
 210
 211
 212 sub new {
 213   my($class,@args) = @_;
 214
 215   my $self = $class->SUPER::new(@args);
 216
 217   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 218                                             SOURCE
 219                                             SCORE
 220                                             ID
 221                                             ACCESSION
 222                                             DESCRIPTION
 223                                             SEQS
 224                                             FEATURES
 225                                             ANNOTATION
 226                                             SEQ_ANNOTATION
 227                                             CONSENSUS
 228                                             CONSENSUS_META
 229                                             )], @args);
 230   $src && $self->source($src);
 231   defined $score && $self->score($score);
 232   # we need to set up internal hashs first!
 233
 234   $self->{'_seq'} = {};
 235   $self->{'_order'} = {};
 236   $self->{'_start_end_lists'} = {};
 237   $self->{'_dis_name'} = {};
 238   $self->{'_id'} = 'NoName';
 239   # maybe we should automatically read in from args. Hmmm...
 240   $id  && $self->id($id);
 241   $acc && $self->accession($acc);
 242   $desc && $self->description($desc);
 243   $coll && $self->annotation($coll);
 244   # sequence annotation is layered into a provided annotation collection (or dies)
 245   if ($sa) {
 246     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 247                  "with a sequence annotation collection")
 248         if !$coll;
 249     $coll->add_Annotation('seq_annotation', $sa);
 250   }
 251   if ($feats && ref $feats eq 'ARRAY') {
 252     for my $feat (@$feats) {
 253         $self->add_SeqFeature($feat);
 254     }
 255   }
 256   $con && $self->consensus($con);
 257   $cmeta && $self->consensus_meta($cmeta);
 258   # assumes these are in correct alignment order
 259   if ($seqs && ref($seqs) eq 'ARRAY') {
 260     for my $seq (@$seqs) {
 261         $self->add_seq($seq);
 262     }
 263   }
 264
 265   return $self; # success - we hope!
 266 }
 267
 268 =head1 Modifier methods
 269
 270 These methods modify the MSA by adding, removing or shuffling complete
 271 sequences.
 272
 273 =head2 add_seq
 274
 275  Title     : add_seq
 276  Usage     : $myalign->add_seq($newseq);
 277              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 278  Function  : Adds another sequence to the alignment. *Does not* align
 279              it - just adds it to the hashes.
 280              If -ORDER is specified, the sequence is inserted at the
 281              the position spec'd by -ORDER, and existing sequences
 282              are pushed down the storage array.
 283  Returns   : nothing
 284  Args      : A Bio::LocatableSeq object
 285              Positive integer for the sequence position (optional)
 286
 287 See L<Bio::LocatableSeq> for more information
 288
 289 =cut
 290
 291 sub addSeq {
 292     my $self = shift;
 293     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 294     $self->add_seq(@_);
 295 }
 296
 297 sub add_seq {
 298     my $self = shift;
 299     my @args = @_;
 300     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 301     my ($name,$id,$start,$end);
 302
 303     unless ($seq) {
 304         $self->throw("LocatableSeq argument required");
 305     }
 306     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 307         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 308     }
 309     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 310     $order--; # jay's patch (user-specified order is 1-origin)
 311
 312     if ($order < 0) {
 313         $self->throw("User-specified value for ORDER must be >= 1");
 314     }
 315
 316     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 317
 318     # build the symbol list for this sequence,
 319     # will prune out the gap and missing/match chars
 320     # when actually asked for the symbol list in the
 321     # symbol_chars
 322     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 323
 324     $name = $seq->get_nse;
 325
 326     if( $self->{'_seq'}->{$name} ) {
 327         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 328     }
 329     else {
 330         $self->debug( "Assigning $name to $order\n");
 331
 332     my $ordh = $self->{'_order'};
 333     if ($ordh->{$order}) {
 334         # make space to insert
 335         # $c->() returns (in reverse order) the first subsequence
 336         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 337         # (3,2,1), and $c->(2,4,5) returns (2).
 338         my $c;
 339         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 340         map {
 341      $ordh->{$_+1} = $ordh->{$_}
 342         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 343
 344     }
 345     $ordh->{$order} = $name;
 346
 347         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 348             $self->{'_start_end_lists'}->{$id} = [];
 349         }
 350         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 351     }
 352
 353     $self->{'_seq'}->{$name} = $seq;
 354
 355 }
 356
 357
 358 =head2 remove_seq
 359
 360  Title     : remove_seq
 361  Usage     : $aln->remove_seq($seq);
 362  Function  : Removes a single sequence from an alignment
 363  Returns   :
 364  Argument  : a Bio::LocatableSeq object
 365
 366 =cut
 367
 368 sub removeSeq {
 369     my $self = shift;
 370     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 371     $self->remove_seq(@_);
 372 }
 373
 374 sub remove_seq {
 375     my $self = shift;
 376     my $seq = shift;
 377     my ($name,$id,$start,$end);
 378
 379     $self->throw("Need Bio::Locatable seq argument ")
 380         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 381
 382     $id = $seq->id();
 383     $start = $seq->start();
 384     $end  = $seq->end();
 385     $name = sprintf("%s/%d-%d",$id,$start,$end);
 386
 387     if( !exists $self->{'_seq'}->{$name} ) {
 388         $self->throw("Sequence $name does not exist in the alignment to remove!");
 389     }
 390
 391     delete $self->{'_seq'}->{$name};
 392
 393     # we need to remove this seq from the start_end_lists hash
 394
 395     if (exists $self->{'_start_end_lists'}->{$id}) {
 396         # we need to find the sequence in the array.
 397
 398         my ($i, $found);;
 399         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 400             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 401                 $found = 1;
 402                 last;
 403             }
 404         }
 405         if ($found) {
 406             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 407         }
 408         else {
 409             $self->throw("Could not find the sequence to remoce from the start-end list");
 410         }
 411     }
 412     else {
 413         $self->throw("There is no seq list for the name $id");
 414     }
 415     # we need to shift order hash
 416     my %rev_order = reverse %{$self->{'_order'}};
 417     my $no = $rev_order{$name};
 418     my $num_sequences = $self->num_sequences;
 419     for (; $no < $num_sequences; $no++) {
 420        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 421     }
 422     delete $self->{'_order'}->{$no};
 423     return 1;
 424 }
 425
 426
 427 =head2 purge
 428
 429  Title   : purge
 430  Usage   : $aln->purge(0.7);
 431  Function: Removes sequences above given sequence similarity
 432            This function will grind on large alignments. Beware!
 433  Example :
 434  Returns : An array of the removed sequences
 435  Args    : float, threshold for similarity
 436
 437 =cut
 438
 439 sub purge {
 440         my ($self,$perc) = @_;
 441         my (%duplicate, @dups);
 442
 443         my @seqs = $self->each_seq();
 444
 445         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 446                 my $seq = $seqs[$i];
 447
 448                 #skip if already in duplicate hash
 449                 next if exists $duplicate{$seq->display_id} ;
 450                 my $one = $seq->seq();
 451
 452                 my @one = split '', $one;       #split to get 1aa per array element
 453
 454                 for (my $j=$i+1;$j < @seqs;$j++) {
 455                         my $seq2 = $seqs[$j];
 456
 457                         #skip if already in duplicate hash
 458                         next if exists $duplicate{$seq2->display_id} ;
 459
 460                         my $two = $seq2->seq();
 461                         my @two = split '', $two;
 462
 463                         my $count = 0;
 464                         my $res = 0;
 465                         for (my $k=0;$k<@one;$k++) {
 466                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 467                                           $one[$k] eq $two[$k]) {
 468                                         $count++;
 469                                 }
 470                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 471                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 472                                         $res++;
 473                                 }
 474                         }
 475
 476                         my $ratio = 0;
 477                         $ratio = $count/$res unless $res == 0;
 478
 479                         # if above threshold put in duplicate hash and push onto
 480                         # duplicate array for returning to get_unique
 481                         if ( $ratio > $perc ) {
 482                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 483                                 $duplicate{$seq2->display_id} = 1;
 484                                 push @dups, $seq2;
 485                         }
 486                 }
 487         }
 488         foreach my $seq (@dups) {
 489                 $self->remove_seq($seq);
 490         }
 491         return @dups;
 492 }
 493
 494 =head2 sort_alphabetically
 495
 496  Title     : sort_alphabetically
 497  Usage     : $ali->sort_alphabetically
 498  Function  : Changes the order of the alignment to alphabetical on name
 499              followed by numerical by number.
 500  Returns   :
 501  Argument  :
 502
 503 =cut
 504
 505 sub sort_alphabetically {
 506     my $self = shift;
 507     my ($seq,$nse,@arr,%hash,$count);
 508
 509     foreach $seq ( $self->each_seq() ) {
 510         $nse = $seq->get_nse;
 511         $hash{$nse} = $seq;
 512     }
 513
 514     $count = 0;
 515
 516     %{$self->{'_order'}} = (); # reset the hash;
 517
 518     foreach $nse ( sort _alpha_startend keys %hash) {
 519         $self->{'_order'}->{$count} = $nse;
 520
 521         $count++;
 522     }
 523     1;
 524 }
 525
 526 =head2 sort_by_list
 527
 528  Title     : sort_by_list
 529  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 530  Function  : Arbitrarily order sequences in an alignment
 531  Returns   : A new Bio::SimpleAlign object
 532  Argument  : a file listing sequence names in intended order (one name per line)
 533
 534 =cut
 535
 536 sub sort_by_list {
 537     my ($self, $list) = @_;
 538     my (@seq, @ids, %order);
 539
 540     foreach my $seq ( $self->each_seq() ) {
 541         push @seq, $seq;
 542         push @ids, $seq->display_id;
 543     }
 544
 545     my $ct=1;
 546     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 547     while (<$listfh>) {
 548       chomp;
 549       my $name=$_;
 550       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 551       $order{$name}=$ct++;
 552     }
 553     close($listfh);
 554
 555     # use the map-sort-map idiom:
 556     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 557     my $aln = $self->new;
 558     foreach (@sorted) { $aln->add_seq($_) }
 559     return $aln;
 560 }
 561
 562 =head2 set_new_reference
 563
 564  Title     : set_new_reference
 565  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 566              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 567              as the reference (1st) sequence
 568  Function  : Change/Set a new reference (i.e., the first) sequence
 569  Returns   : a new Bio::SimpleAlign object.
 570              Throws an exception if designated sequence not found
 571  Argument  : a positive integer of sequence order, or a sequence name
 572              in the original alignment
 573
 574 =cut
 575
 576 sub set_new_reference {
 577     my ($self, $seqid) = @_;
 578     my $aln = $self->new;
 579     my (@seq, @ids, @new_seq);
 580     my $is_num=0;
 581     foreach my $seq ( $self->each_seq() ) {
 582         push @seq, $seq;
 583         push @ids, $seq->display_id;
 584     }
 585
 586     if ($seqid =~ /^\d+$/) { # argument is seq position
 587         $is_num=1;
 588         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 589     } else { # argument is a seq name
 590         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 591     }
 592
 593     for (my $i=0; $i<=$#seq; $i++) {
 594         my $pos=$i+1;
 595         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 596             unshift @new_seq, $seq[$i];
 597         } else {
 598             push @new_seq, $seq[$i];
 599         }
 600     }
 601     foreach (@new_seq) { $aln->add_seq($_);  }
 602     return $aln;
 603 }
 604
 605 sub _in_aln {  # check if input name exists in the alignment
 606     my ($str, $ref) = @_;
 607     foreach (@$ref) {
 608         return 1 if $str eq $_;
 609     }
 610     return 0;
 611 }
 612
 613
 614 =head2 uniq_seq
 615
 616  Title     : uniq_seq
 617  Usage     : $aln->uniq_seq():  Remove identical sequences in
 618              in the alignment.  Ambiguous base ("N", "n") and
 619              leading and ending gaps ("-") are NOT counted as
 620              differences.
 621  Function  : Make a new alignment of unique sequence types (STs)
 622  Returns   : 1a. if called in a scalar context,
 623                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 624              1b. if called in an array context,
 625                 a new Bio::SimpleAlign object, and a hashref whose keys
 626                 are sequence types, and whose values are arrayrefs to
 627                 lists of sequence ids within the corresponding sequence type
 628              2. if $aln->verbose > 0, ST of each sequence is sent to
 629                 STDERR (in a tabular format)
 630  Argument  : None
 631
 632 =cut
 633
 634 sub uniq_seq {
 635     my ($self, $seqid) = @_;
 636     my $aln = $self->new;
 637     my (%member, %order, @seq, @uniq_str, $st);
 638     my $order=0;
 639     my $len = $self->length();
 640     $st = {};
 641     foreach my $seq ( $self->each_seq() ) {
 642         my $str = $seq->seq();
 643
 644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 645 # comparing two sequence strings
 646
 647     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 648         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 649     # 2nd, convert leading and ending gaps to "?":
 650         $str = &_convert_leading_ending_gaps($str, '-', '?');
 651     # Note that '?' also can mean unknown residue.
 652     # I don't like making global class member changes like this, too
 653     # prone to errors... -- cjfields 08-11-18
 654     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 655         my $new = Bio::LocatableSeq->new(
 656                      -id      => $seq->id(),
 657                                          -alphabet=> $seq->alphabet,
 658                                          -seq     => $str,
 659                                          -start   => $seq->start,
 660                                          -end     => $seq->end
 661                                          );
 662         push @seq, $new;
 663     }
 664
 665     foreach my $seq (@seq) {
 666         my $str = $seq->seq();
 667         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 668         if ($seen) { # seen before
 669             my @memb = @{$member{$key}};
 670             push @memb, $seq;
 671             $member{$key} = \@memb;
 672         } else {  # not seen
 673             push @uniq_str, $key;
 674             $order++;
 675             $member{$key} = [ ($seq) ];
 676             $order{$key} = $order;
 677         }
 678     }
 679
 680     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 682         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 684         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 685         my $gap='-';
 686         my $end=length($str2);
 687         $end -= length($1) while $str2 =~ m/($gap+)/g;
 688         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 689                                          -seq  =>$str2,
 690                                          -start=>1,
 691                                          -end  =>$end
 692                                          );
 693         $aln->add_seq($new);
 694         foreach (@{$member{$str}}) {
 695             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 696         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 697         }
 698     }
 699     return wantarray ? ($aln, $st) : $aln;
 700 }
 701
 702 sub _check_uniq {  # check if same seq exists in the alignment
 703     my ($str1, $ref, $length) = @_;
 704     my @char1=split //, $str1;
 705     my @array=@$ref;
 706
 707     return (0, $str1) if @array==0; # not seen (1st sequence)
 708
 709     foreach my $str2 (@array) {
 710         my $diff=0;
 711         my @char2=split //, $str2;
 712         for (my $i=0; $i<=$length-1; $i++) {
 713             next if $char1[$i] eq '?';
 714             next if $char2[$i] eq '?';
 715             $diff++ if $char1[$i] ne $char2[$i];
 716         }
 717         return (1, $str2) if $diff == 0;  # seen before
 718     }
 719
 720     return (0, $str1); # not seen
 721 }
 722
 723 sub _convert_leading_ending_gaps {
 724     my $s=shift;
 725     my $sym1=shift;
 726     my $sym2=shift;
 727     my @array=split //, $s;
 728 # convert leading char:
 729     for (my $i=0; $i<=$#array; $i++) {
 730         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 731     }
 732 # convert ending char:
 733     for (my $i = $#array; $i>= 0; $i--) {
 734         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 735     }
 736     my $s_new=join '', @array;
 737     return $s_new;
 738 }
 739
 740 =head1 Sequence selection methods
 741
 742 Methods returning one or more sequences objects.
 743
 744 =head2 each_seq
 745
 746  Title     : each_seq
 747  Usage     : foreach $seq ( $align->each_seq() )
 748  Function  : Gets a Seq object from the alignment
 749  Returns   : Seq object
 750  Argument  :
 751
 752 =cut
 753
 754 sub eachSeq {
 755     my $self = shift;
 756     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 757     $self->each_seq();
 758 }
 759
 760 sub each_seq {
 761         my $self = shift;
 762         my (@arr,$order);
 763
 764         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 765                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 766                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 767                 }
 768         }
 769         return @arr;
 770 }
 771
 772
 773 =head2 each_alphabetically
 774
 775  Title     : each_alphabetically
 776  Usage     : foreach $seq ( $ali->each_alphabetically() )
 777  Function  : Returns a sequence object, but the objects are returned
 778              in alphabetically sorted order.
 779              Does not change the order of the alignment.
 780  Returns   : Seq object
 781  Argument  :
 782
 783 =cut
 784
 785 sub each_alphabetically {
 786         my $self = shift;
 787         my ($seq,$nse,@arr,%hash,$count);
 788
 789         foreach $seq ( $self->each_seq() ) {
 790                 $nse = $seq->get_nse;
 791                 $hash{$nse} = $seq;
 792         }
 793
 794         foreach $nse ( sort _alpha_startend keys %hash) {
 795                 push(@arr,$hash{$nse});
 796         }
 797         return @arr;
 798 }
 799
 800 sub _alpha_startend {
 801     my ($aname,$astart,$bname,$bstart);
 802     ($aname,$astart) = split (/-/,$a);
 803     ($bname,$bstart) = split (/-/,$b);
 804
 805     if( $aname eq $bname ) {
 806         return $astart <=> $bstart;
 807     }
 808     else {
 809         return $aname cmp $bname;
 810     }
 811 }
 812
 813 =head2 each_seq_with_id
 814
 815  Title     : each_seq_with_id
 816  Usage     : foreach $seq ( $align->each_seq_with_id() )
 817  Function  : Gets a Seq objects from the alignment, the contents
 818              being those sequences with the given name (there may be
 819              more than one)
 820  Returns   : Seq object
 821  Argument  : a seq name
 822
 823 =cut
 824
 825 sub eachSeqWithId {
 826     my $self = shift;
 827     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 828     $self->each_seq_with_id(@_);
 829 }
 830
 831 sub each_seq_with_id {
 832     my $self = shift;
 833     my $id = shift;
 834
 835     $self->throw("Method each_seq_with_id needs a sequence name argument")
 836         unless defined $id;
 837
 838     my (@arr, $seq);
 839
 840     if (exists($self->{'_start_end_lists'}->{$id})) {
 841         @arr = @{$self->{'_start_end_lists'}->{$id}};
 842     }
 843     return @arr;
 844 }
 845
 846 =head2 get_seq_by_pos
 847
 848  Title     : get_seq_by_pos
 849  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 850  Function  : Gets a sequence based on its position in the alignment.
 851              Numbering starts from 1.  Sequence positions larger than
 852              num_sequences() will thow an error.
 853  Returns   : a Bio::LocatableSeq object
 854  Args      : positive integer for the sequence position
 855
 856 =cut
 857
 858 sub get_seq_by_pos {
 859
 860     my $self = shift;
 861     my ($pos) = @_;
 862
 863     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 864         unless $pos =~ /^\d+$/ and $pos > 0;
 865     $self->throw("No sequence at position [$pos]")
 866         unless $pos <= $self->num_sequences ;
 867
 868     my $nse = $self->{'_order'}->{--$pos};
 869     return $self->{'_seq'}->{$nse};
 870 }
 871
 872 =head2 get_seq_by_id
 873
 874  Title     : get_seq_by_id
 875  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 876  Function  : Gets a sequence based on its name.
 877              Sequences that do not exist will warn and return undef
 878  Returns   : a Bio::LocatableSeq object
 879  Args      : string for sequence name
 880
 881 =cut
 882
 883 sub get_seq_by_id {
 884     my ($self,$name) = @_;
 885     unless( defined $name ) {
 886       $self->warn("Must provide a sequence name");
 887       return;
 888     }
 889     for my $seq ( values %{$self->{'_seq'}} ) {
 890       if ( $seq->id eq $name) {
 891         return $seq;
 892       }
 893     }
 894     return;
 895 }
 896
 897 =head2 seq_with_features
 898
 899  Title   : seq_with_features
 900  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 901                                           -consensus => 60
 902                                           -mask =>
 903            sub { my $consensus = shift;
 904
 905                  for my $i (1..5){
 906                     my $n = 'N' x $i;
 907                     my $q = '\?' x $i;
 908                     while($consensus =~ /[^?]$q[^?]/){
 909                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 910                     }
 911                   }
 912                  return $consensus;
 913                }
 914                                          );
 915  Function: produces a Bio::Seq object by first splicing gaps from -pos
 916            (by means of a splice_by_seq_pos() call), then creating
 917            features using non-? chars (by means of a consensus_string()
 918            call with stringency -consensus).
 919  Returns : a Bio::Seq object
 920  Args    : -pos : required. sequence from which to build the Bio::Seq
 921              object
 922            -consensus : optional, defaults to consensus_string()'s
 923              default cutoff value
 924            -mask : optional, a coderef to apply to consensus_string()'s
 925              output before building features.  this may be useful for
 926              closing gaps of 1 bp by masking over them with N, for
 927              instance
 928
 929 =cut
 930
 931 sub seq_with_features{
 932    my ($self,%arg) = @_;
 933
 934    #first do the preparatory splice
 935    $self->throw("must provide a -pos argument") unless $arg{-pos};
 936    $self->splice_by_seq_pos($arg{-pos});
 937
 938    my $consensus_string = $self->consensus_string($arg{-consensus});
 939    $consensus_string = $arg{-mask}->($consensus_string)
 940          if defined($arg{-mask});
 941
 942    my(@bs,@es);
 943
 944    push @bs, 1 if $consensus_string =~ /^[^?]/;
 945
 946    while($consensus_string =~ /\?[^?]/g){
 947          push @bs, pos($consensus_string);
 948    }
 949    while($consensus_string =~ /[^?]\?/g){
 950          push @es, pos($consensus_string);
 951    }
 952
 953    push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
 954
 955    my $seq = Bio::Seq->new();
 956
 957 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 958 #                -source_tag => 'location',
 959 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 960 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 961 #                                                  );
 962 #   $seq->add_SeqFeature($rootfeature);
 963
 964    while(my $b = shift @bs){
 965          my $e = shift @es;
 966          $seq->add_SeqFeature(
 967        Bio::SeqFeature::Generic->new(
 968          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 969          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 970          -source_tag => $self->source || 'MSA',
 971        )
 972      );
 973    }
 974
 975    return $seq;
 976 }
 977
 978
 979 =head1 Create new alignments
 980
 981 The result of these methods are horizontal or vertical subsets of the
 982 current MSA.
 983
 984 =head2 select
 985
 986  Title     : select
 987  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 988  Function  : Creates a new alignment from a continuous subset of
 989              sequences.  Numbering starts from 1.  Sequence positions
 990              larger than num_sequences() will thow an error.
 991  Returns   : a Bio::SimpleAlign object
 992  Args      : positive integer for the first sequence
 993              positive integer for the last sequence to include (optional)
 994
 995 =cut
 996
 997 sub select {
 998     my $self = shift;
 999     my ($start, $end) = @_;
1000
1001     $self->throw("Select start has to be a positive integer, not [$start]")
1002         unless $start =~ /^\d+$/ and $start > 0;
1003     $self->throw("Select end has to be a positive integer, not [$end]")
1004         unless $end  =~ /^\d+$/ and $end > 0;
1005     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006         unless $start <= $end;
1007
1008     my $aln = $self->new;
1009     foreach my $pos ($start .. $end) {
1010         $aln->add_seq($self->get_seq_by_pos($pos));
1011     }
1012     $aln->id($self->id);
1013     # fix for meta, sf, ann
1014     return $aln;
1015 }
1016
1017 =head2 select_noncont
1018
1019  Title     : select_noncont
1020  Usage     : # 1st and 3rd sequences, sorted
1021              $aln2 = $aln->select_noncont(1, 3)
1022
1023              # 1st and 3rd sequences, sorted (same as first)
1024              $aln2 = $aln->select_noncont(3, 1)
1025
1026              # 1st and 3rd sequences, unsorted
1027              $aln2 = $aln->select_noncont('nosort',3, 1)
1028
1029  Function  : Creates a new alignment from a subset of sequences.  Numbering
1030              starts from 1.  Sequence positions larger than num_sequences() will
1031              throw an error.  Sorts the order added to new alignment by default,
1032              to prevent sorting pass 'nosort' as the first argument in the list.
1033  Returns   : a Bio::SimpleAlign object
1034  Args      : array of integers for the sequences.  If the string 'nosort' is
1035              passed as the first argument, the sequences will not be sorted
1036              in the new alignment but will appear in the order listed.
1037
1038 =cut
1039
1040 sub select_noncont {
1041         my $self = shift;
1042     my $nosort = 0;
1043         my (@pos) = @_;
1044     if ($pos[0] !~ m{^\d+$}) {
1045         my $sortcmd = shift @pos;
1046         if ($sortcmd eq 'nosort') {
1047             $nosort = 1;
1048         } else {
1049             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1050         }
1051     }
1052
1053     my $end = $self->num_sequences;
1054     foreach ( @pos ) {
1055                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1057         }
1058
1059         @pos = sort {$a <=> $b} @pos unless $nosort;
1060
1061         my $aln = $self->new;
1062         foreach my $p (@pos) {
1063                 $aln->add_seq($self->get_seq_by_pos($p));
1064         }
1065         $aln->id($self->id);
1066     # fix for meta, sf, ann
1067         return $aln;
1068 }
1069
1070 =head2 slice
1071
1072  Title     : slice
1073  Usage     : $aln2 = $aln->slice(20,30)
1074  Function  : Creates a slice from the alignment inclusive of start and
1075              end columns, and the first column in the alignment is denoted 1.
1076              Sequences with no residues in the slice are excluded from the
1077              new alignment and a warning is printed. Slice beyond the length of
1078              the sequence does not do padding.
1079  Returns   : A Bio::SimpleAlign object
1080  Args      : Positive integer for start column, positive integer for end column,
1081              optional boolean which if true will keep gap-only columns in the newly
1082              created slice. Example:
1083
1084              $aln2 = $aln->slice(20,30,1)
1085
1086 =cut
1087
1088 sub slice {
1089         my $self = shift;
1090         my ($start, $end, $keep_gap_only) = @_;
1091
1092         $self->throw("Slice start has to be a positive integer, not [$start]")
1093           unless $start =~ /^\d+$/ and $start > 0;
1094         $self->throw("Slice end has to be a positive integer, not [$end]")
1095           unless $end =~ /^\d+$/ and $end > 0;
1096         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097           unless $start <= $end;
1098         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099                                          "[$start] is too big.") if $start > $self->length;
1100     my $cons_meta = $self->consensus_meta;
1101         my $aln = $self->new;
1102         $aln->id($self->id);
1103         foreach my $seq ( $self->each_seq() ) {
1104             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105             Bio::Seq::Meta->new
1106         (-id      => $seq->id,
1107                  -alphabet => $seq->alphabet,
1108                  -strand  => $seq->strand,
1109                  -verbose => $self->verbose) :
1110             Bio::LocatableSeq->new
1111         (-id      => $seq->id,
1112                  -alphabet => $seq->alphabet,
1113                  -strand  => $seq->strand,
1114                  -verbose => $self->verbose);
1115
1116             # seq
1117             my $seq_end = $end;
1118             $seq_end = $seq->length if( $end > $seq->length );
1119
1120             my $slice_seq = $seq->subseq($start, $seq_end);
1121             $new_seq->seq( $slice_seq );
1122
1123             $slice_seq =~ s/\W//g;
1124
1125             if ($start > 1) {
1126             my $pre_start_seq = $seq->subseq(1, $start - 1);
1127             $pre_start_seq =~ s/\W//g;
1128             if (!defined($seq->strand)) {
1129                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130             } elsif ($seq->strand < 0){
1131                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132             } else {
1133                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1134             }
1135             } else {
1136             $new_seq->start( $seq->start);
1137             }
1138         if ($new_seq->isa('Bio::Seq::MetaI')) {
1139             for my $meta_name ($seq->meta_names) {
1140                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1141             }
1142         }
1143             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1144
1145             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1146             $aln->add_seq($new_seq);
1147             } else {
1148             if( $keep_gap_only ) {
1149                 $aln->add_seq($new_seq);
1150             } else {
1151                 my $nse = $seq->get_nse();
1152                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1153                     " Sequence excluded from the new alignment.");
1154             }
1155             }
1156         }
1157     if ($cons_meta) {
1158         my $new = Bio::Seq::Meta->new();
1159         for my $meta_name ($cons_meta->meta_names) {
1160             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1161         }
1162         $aln->consensus_meta($new);
1163     }
1164     $aln->annotation($self->annotation);
1165     # fix for meta, sf, ann
1166         return $aln;
1167 }
1168
1169 =head2 remove_columns
1170
1171  Title     : remove_columns
1172  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1173              $aln2 = $aln->remove_columns([0,0],[6,8])
1174  Function  : Creates an aligment with columns removed corresponding to
1175              the specified type or by specifying the columns by number.
1176  Returns   : Bio::SimpleAlign object
1177  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1178              'all_gaps_columns') or array ref where the referenced array
1179              contains a pair of integers that specify a range.
1180              The first column is 0
1181 =cut
1182
1183 sub remove_columns {
1184     my ($self,@args) = @_;
1185     @args || $self->throw("Must supply column ranges or column types");
1186     my $aln;
1187
1188     if ($args[0][0] =~ /^[a-z_]+$/i) {
1189         $aln = $self->_remove_columns_by_type($args[0]);
1190     } elsif ($args[0][0] =~ /^\d+$/) {
1191         $aln = $self->_remove_columns_by_num(\@args);
1192     } else {
1193         $self->throw("You must pass array references to remove_columns(), not @args");
1194     }
1195     # fix for meta, sf, ann
1196     $aln;
1197 }
1198
1199
1200 =head2 remove_gaps
1201
1202  Title     : remove_gaps
1203  Usage     : $aln2 = $aln->remove_gaps
1204  Function  : Creates an aligment with gaps removed
1205  Returns   : a Bio::SimpleAlign object
1206  Args      : a gap character(optional) if none specified taken
1207                 from $self->gap_char,
1208              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1209                         indicates that only all-gaps columns should be deleted
1210
1211 Used from method L<remove_columns> in most cases. Set gap character
1212 using L<gap_char()|gap_char>.
1213
1214 =cut
1215
1216 sub remove_gaps {
1217     my ($self,$gapchar,$all_gaps_columns) = @_;
1218     my $gap_line;
1219     if ($all_gaps_columns) {
1220         $gap_line = $self->all_gap_line($gapchar);
1221     } else {
1222         $gap_line = $self->gap_line($gapchar);
1223     }
1224     my $aln = $self->new;
1225
1226     my @remove;
1227     my $length = 0;
1228     my $del_char = $gapchar || $self->gap_char;
1229     # Do the matching to get the segments to remove
1230     while ($gap_line =~ m/[$del_char]/g) {
1231         my $start = pos($gap_line)-1;
1232         $gap_line=~/\G[$del_char]+/gc;
1233         my $end = pos($gap_line)-1;
1234
1235         #have to offset the start and end for subsequent removes
1236         $start-=$length;
1237         $end  -=$length;
1238         $length += ($end-$start+1);
1239         push @remove, [$start,$end];
1240     }
1241
1242     #remove the segments
1243     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1244     # fix for meta, sf, ann
1245     return $aln;
1246 }
1247
1248
1249 sub _remove_col {
1250     my ($self,$aln,$remove) = @_;
1251     my @new;
1252
1253     my $gap = $self->gap_char;
1254
1255     # splice out the segments and create new seq
1256     foreach my $seq($self->each_seq){
1257         my $new_seq = Bio::LocatableSeq->new(
1258                                              -id      => $seq->id,
1259                                              -alphabet=> $seq->alphabet,
1260                                              -strand  => $seq->strand,
1261                                              -verbose => $self->verbose);
1262         my $sequence = $seq->seq;
1263         foreach my $pair(@{$remove}){
1264             my $start = $pair->[0];
1265             my $end   = $pair->[1];
1266             $sequence = $seq->seq unless $sequence;
1267             my $orig = $sequence;
1268             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1269             my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1270             $sequence = $head.$tail;
1271             # start
1272             unless (defined $new_seq->start) {
1273                 if ($start == 0) {
1274                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1275                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1276                 }
1277                 else {
1278                     my $start_adjust = $orig =~ /^$gap+/;
1279                     if ($start_adjust) {
1280                         $start_adjust = $+[0] == $start;
1281                     }
1282                     $new_seq->start($seq->start + $start_adjust);
1283                 }
1284             }
1285             # end
1286             if (($end + 1) >= length($orig)) {
1287                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1288                 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1289             }
1290             else {
1291                 $new_seq->end($seq->end);
1292             }
1293         }
1294
1295         if ($new_seq->end < $new_seq->start) {
1296             # we removed all columns except for gaps: set to 0 to indicate no
1297             # sequence
1298             $new_seq->start(0);
1299             $new_seq->end(0);
1300         }
1301
1302         $new_seq->seq($sequence) if $sequence;
1303                 push @new, $new_seq;
1304     }
1305     # add the new seqs to the alignment
1306     foreach my $new(@new){
1307         $aln->add_seq($new);
1308     }
1309     # fix for meta, sf, ann
1310     return $aln;
1311 }
1312
1313 sub _remove_columns_by_type {
1314         my ($self,$type) = @_;
1315         my $aln = $self->new;
1316         my @remove;
1317
1318         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1319         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1320         my %matchchars = ( 'match'           => '\*',
1321                        'weak'             => '\.',
1322                        'strong'           => ':',
1323                        'mismatch'         => ' ',
1324                        'gaps'             => '',
1325                        'all_gaps_columns' => ''
1326                      );
1327         # get the characters to delete against
1328         my $del_char;
1329         foreach my $type (@{$type}){
1330                 $del_char.= $matchchars{$type};
1331         }
1332
1333         my $length = 0;
1334         my $match_line = $self->match_line;
1335         # do the matching to get the segments to remove
1336         if($del_char){
1337                 while($match_line =~ m/[$del_char]/g ){
1338                         my $start = pos($match_line)-1;
1339                         $match_line=~/\G[$del_char]+/gc;
1340                         my $end = pos($match_line)-1;
1341
1342                         #have to offset the start and end for subsequent removes
1343                         $start-=$length;
1344                         $end  -=$length;
1345                         $length += ($end-$start+1);
1346                         push @remove, [$start,$end];
1347                 }
1348         }
1349
1350         # remove the segments
1351         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1352         $aln = $aln->remove_gaps() if $gap;
1353         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1354     # fix for meta, sf, ann
1355         $aln;
1356 }
1357
1358
1359 sub _remove_columns_by_num {
1360         my ($self,$positions) = @_;
1361         my $aln = $self->new;
1362
1363         # sort the positions
1364         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1365
1366     my @remove;
1367     my $length = 0;
1368     foreach my $pos (@{$positions}) {
1369         my ($start, $end) = @{$pos};
1370
1371         #have to offset the start and end for subsequent removes
1372         $start-=$length;
1373         $end  -=$length;
1374         $length += ($end-$start+1);
1375         push @remove, [$start,$end];
1376     }
1377
1378     #remove the segments
1379     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1380     # fix for meta, sf, ann
1381         $aln;
1382 }
1383
1384
1385 =head1 Change sequences within the MSA
1386
1387 These methods affect characters in all sequences without changing the
1388 alignment.
1389
1390 =head2 splice_by_seq_pos
1391
1392  Title   : splice_by_seq_pos
1393  Usage   : $status = splice_by_seq_pos(1);
1394  Function: splices all aligned sequences where the specified sequence
1395            has gaps.
1396  Example :
1397  Returns : 1 on success
1398  Args    : position of sequence to splice by
1399
1400
1401 =cut
1402
1403 sub splice_by_seq_pos{
1404   my ($self,$pos) = @_;
1405
1406   my $guide = $self->get_seq_by_pos($pos);
1407   my $guide_seq = $guide->seq;
1408
1409   $guide_seq =~ s/\./\-/g;
1410
1411   my @gaps = ();
1412   $pos = -1;
1413   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1414     unshift @gaps, $pos;
1415     $pos++;
1416   }
1417
1418   foreach my $seq ($self->each_seq){
1419     my @bases = split '', $seq->seq;
1420
1421     splice(@bases, $_, 1) foreach @gaps;
1422     $seq->seq(join('', @bases));
1423   }
1424
1425   1;
1426 }
1427
1428 =head2 map_chars
1429
1430  Title     : map_chars
1431  Usage     : $ali->map_chars('\.','-')
1432  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1433              characters
1434
1435              Notice that the from (arg1) is interpretted as a regex,
1436              so be careful about quoting meta characters (eg
1437              $ali->map_chars('.','-') wont do what you want)
1438  Returns   :
1439  Argument  : 'from' rexexp
1440              'to' string
1441
1442 =cut
1443
1444 sub map_chars {
1445     my $self = shift;
1446     my $from = shift;
1447     my $to   = shift;
1448     my ($seq,$temp);
1449
1450     $self->throw("Need exactly two arguments")
1451         unless defined $from and defined $to;
1452
1453     foreach $seq ( $self->each_seq() ) {
1454         $temp = $seq->seq();
1455         $temp =~ s/$from/$to/g;
1456         $seq->seq($temp);
1457     }
1458     return 1;
1459 }
1460
1461
1462 =head2 uppercase
1463
1464  Title     : uppercase()
1465  Usage     : $ali->uppercase()
1466  Function  : Sets all the sequences to uppercase
1467  Returns   :
1468  Argument  :
1469
1470 =cut
1471
1472 sub uppercase {
1473     my $self = shift;
1474     my $seq;
1475     my $temp;
1476
1477     foreach $seq ( $self->each_seq() ) {
1478       $temp = $seq->seq();
1479       $temp =~ tr/[a-z]/[A-Z]/;
1480
1481       $seq->seq($temp);
1482     }
1483     return 1;
1484 }
1485
1486 =head2 cigar_line
1487
1488  Title    : cigar_line()
1489  Usage    : %cigars = $align->cigar_line()
1490  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1491             Report) line for each sequence in the alignment. Examples are
1492             "1,60" or "5,10:12,58", where the numbers refer to conserved
1493             positions within the alignment. The keys of the hash are the
1494             NSEs (name/start/end) assigned to each sequence.
1495  Args     : threshold (optional, defaults to 100)
1496  Returns  : Hash of strings (cigar lines)
1497
1498 =cut
1499
1500 sub cigar_line {
1501         my $self = shift;
1502         my $thr=shift||100;
1503         my %cigars;
1504
1505         my @consensus = split "",($self->consensus_string($thr));
1506         my $len = $self->length;
1507         my $gapchar = $self->gap_char;
1508
1509         # create a precursor, something like (1,4,5,6,7,33,45),
1510         # where each number corresponds to a conserved position
1511         foreach my $seq ( $self->each_seq ) {
1512                 my @seq = split "", uc ($seq->seq);
1513                 my $pos = 1;
1514                 for (my $x = 0 ; $x < $len ; $x++ ) {
1515                         if ($seq[$x] eq $consensus[$x]) {
1516                                 push @{$cigars{$seq->get_nse}},$pos;
1517                                 $pos++;
1518                         } elsif ($seq[$x] ne $gapchar) {
1519                                 $pos++;
1520                         }
1521                 }
1522         }
1523         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1524         for my $name (keys %cigars) {
1525                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1526                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1527       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1528            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1529                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1530                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1531                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1532                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1533                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1534                         }
1535       }
1536         }
1537   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1538   for my $name (keys %cigars) {
1539           my @remove;
1540           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1541                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1542                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1543                       unshift @remove,$x;
1544               }
1545            }
1546       for my $pos (@remove) {
1547                         splice @{$cigars{$name}}, $pos, 1;
1548            }
1549    }
1550    # join and punctuate
1551    for my $name (keys %cigars) {
1552           my ($start,$end,$str) = "";
1553           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1554                   $str .= ($start . "," . $end . ":");
1555           }
1556           $str =~ s/:$//;
1557       $cigars{$name} = $str;
1558    }
1559    %cigars;
1560 }
1561
1562
1563 =head2 match_line
1564
1565  Title    : match_line()
1566  Usage    : $line = $align->match_line()
1567  Function : Generates a match line - much like consensus string
1568             except that a line indicating the '*' for a match.
1569  Args     : (optional) Match line characters ('*' by default)
1570             (optional) Strong match char (':' by default)
1571             (optional) Weak match char ('.' by default)
1572  Returns  : String
1573
1574 =cut
1575
1576 sub match_line {
1577         my ($self,$matchlinechar, $strong, $weak) = @_;
1578         my %matchchars = ('match'    => $matchlinechar || '*',
1579                                                           'weak'     => $weak          || '.',
1580                                                           'strong'   => $strong        || ':',
1581                                                           'mismatch' => ' ',
1582                                                   );
1583
1584         my @seqchars;
1585         my $alphabet;
1586         foreach my $seq ( $self->each_seq ) {
1587                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1588                 $alphabet = $seq->alphabet unless defined $alphabet;
1589         }
1590         my $refseq = shift @seqchars;
1591         # let's just march down the columns
1592         my $matchline;
1593  POS:
1594         foreach my $pos ( 0..$self->length ) {
1595                 my $refchar = $refseq->[$pos];
1596                 my $char = $matchchars{'mismatch'};
1597                 unless( defined $refchar ) {
1598                         last if $pos == $self->length; # short circuit on last residue
1599                         # this in place to handle jason's soon-to-be-committed
1600                         # intron mapping code
1601                         goto bottom;
1602                 }
1603                 my %col = ($refchar => 1);
1604                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1605                 foreach my $seq ( @seqchars ) {
1606                         next if $pos >= scalar @$seq;
1607                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1608                                                           $seq->[$pos] eq ' ' );
1609                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1610                 }
1611                 my @colresidues = sort keys %col;
1612
1613                 # if all the values are the same
1614                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1615                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1616                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1617                         # matches for protein seqs
1618             TYPE:
1619                         foreach my $type ( qw(strong weak) ) {
1620                                 # iterate through categories
1621                                 my %groups;
1622                                 # iterate through each of the aa in the col
1623                                 # look to see which groups it is in
1624                                 foreach my $c ( @colresidues ) {
1625                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1626                                                 push @{$groups{$f}},$c;
1627                                         }
1628                                 }
1629                          GRP:
1630                                 foreach my $cols ( values %groups ) {
1631                                         @$cols = sort @$cols;
1632                                         # now we are just testing to see if two arrays
1633                                         # are identical w/o changing either one
1634                                         # have to be same len
1635                                         next if( scalar @$cols != scalar @colresidues );
1636                                         # walk down the length and check each slot
1637                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1638                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1639                                         }
1640                                         $char = $matchchars{$type};
1641                                         last TYPE;
1642                                 }
1643                         }
1644                 }
1645          bottom:
1646                 $matchline .= $char;
1647         }
1648         return $matchline;
1649 }
1650
1651
1652 =head2 gap_line
1653
1654  Title    : gap_line()
1655  Usage    : $line = $align->gap_line()
1656  Function : Generates a gap line - much like consensus string
1657             except that a line where '-' represents gap
1658  Args     : (optional) gap line characters ('-' by default)
1659  Returns  : string
1660
1661 =cut
1662
1663 sub gap_line {
1664     my ($self,$gapchar) = @_;
1665     $gapchar = $gapchar || $self->gap_char;
1666     my %gap_hsh; # column gaps vector
1667     foreach my $seq ( $self->each_seq ) {
1668                 my $i = 0;
1669         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1670                   map {[$i++, $_]} split(//, uc ($seq->seq));
1671     }
1672     my $gap_line;
1673     foreach my $pos ( 0..$self->length-1 ) {
1674           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1675     }
1676     return $gap_line;
1677 }
1678
1679 =head2 all_gap_line
1680
1681  Title    : all_gap_line()
1682  Usage    : $line = $align->all_gap_line()
1683  Function : Generates a gap line - much like consensus string
1684             except that a line where '-' represents all-gap column
1685  Args     : (optional) gap line characters ('-' by default)
1686  Returns  : string
1687
1688 =cut
1689
1690 sub all_gap_line {
1691     my ($self,$gapchar) = @_;
1692     $gapchar = $gapchar || $self->gap_char;
1693     my %gap_hsh;                # column gaps counter hash
1694     my @seqs = $self->each_seq;
1695     foreach my $seq ( @seqs ) {
1696         my $i = 0;
1697         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1698         map {[$i++, $_]} split(//, uc ($seq->seq));
1699     }
1700     my $gap_line;
1701     foreach my $pos ( 0..$self->length-1 ) {
1702         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1703             # gaps column
1704             $gap_line .= $gapchar;
1705         } else {
1706             $gap_line .= '.';
1707         }
1708     }
1709     return $gap_line;
1710 }
1711
1712 =head2 gap_col_matrix
1713
1714  Title    : gap_col_matrix()
1715  Usage    : my $cols = $align->gap_col_matrix()
1716  Function : Generates an array of hashes where
1717             each entry in the array is a hash reference
1718             with keys of all the sequence names and
1719             and value of 1 or 0 if the sequence has a gap at that column
1720  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1721
1722 =cut
1723
1724 sub gap_col_matrix {
1725     my ($self,$gapchar) = @_;
1726     $gapchar = $gapchar || $self->gap_char;
1727     my %gap_hsh; # column gaps vector
1728     my @cols;
1729     foreach my $seq ( $self->each_seq ) {
1730         my $i = 0;
1731         my $str = $seq->seq;
1732         my $len = $seq->length;
1733         my $ch;
1734         my $id = $seq->display_id;
1735         while( $i < $len ) {
1736             $ch = substr($str, $i, 1);
1737             $cols[$i++]->{$id} = ($ch eq $gapchar);
1738         }
1739     }
1740     return \@cols;
1741 }
1742
1743 =head2 match
1744
1745  Title     : match()
1746  Usage     : $ali->match()
1747  Function  : Goes through all columns and changes residues that are
1748              identical to residue in first sequence to match '.'
1749              character. Sets match_char.
1750
1751              USE WITH CARE: Most MSA formats do not support match
1752              characters in sequences, so this is mostly for output
1753              only. NEXUS format (Bio::AlignIO::nexus) can handle
1754              it.
1755  Returns   : 1
1756  Argument  : a match character, optional, defaults to '.'
1757
1758 =cut
1759
1760 sub match {
1761     my ($self, $match) = @_;
1762
1763     $match ||= '.';
1764     my ($matching_char) = $match;
1765     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1766     $self->map_chars($matching_char, '-');
1767
1768     my @seqs = $self->each_seq();
1769     return 1 unless scalar @seqs > 1;
1770
1771     my $refseq = shift @seqs ;
1772     my @refseq = split //, $refseq->seq;
1773     my $gapchar = $self->gap_char;
1774
1775     foreach my $seq ( @seqs ) {
1776         my @varseq = split //, $seq->seq();
1777         for ( my $i=0; $i < scalar @varseq; $i++) {
1778             $varseq[$i] = $match if defined $refseq[$i] &&
1779                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1780                   $refseq[$i] =~ /$gapchar/ )
1781                       && $refseq[$i] eq $varseq[$i];
1782         }
1783         $seq->seq(join '', @varseq);
1784     }
1785     $self->match_char($match);
1786     return 1;
1787 }
1788
1789
1790 =head2 unmatch
1791
1792  Title     : unmatch()
1793  Usage     : $ali->unmatch()
1794  Function  : Undoes the effect of method match. Unsets match_char.
1795  Returns   : 1
1796  Argument  : a match character, optional, defaults to '.'
1797
1798 See L<match> and L<match_char>
1799
1800 =cut
1801
1802 sub unmatch {
1803     my ($self, $match) = @_;
1804
1805     $match ||= '.';
1806
1807     my @seqs = $self->each_seq();
1808     return 1 unless scalar @seqs > 1;
1809
1810     my $refseq = shift @seqs ;
1811     my @refseq = split //, $refseq->seq;
1812     my $gapchar = $self->gap_char;
1813     foreach my $seq ( @seqs ) {
1814         my @varseq = split //, $seq->seq();
1815         for ( my $i=0; $i < scalar @varseq; $i++) {
1816             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1817                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1818                   $refseq[$i] =~ /$gapchar/ ) &&
1819                       $varseq[$i] eq $match;
1820         }
1821         $seq->seq(join '', @varseq);
1822     }
1823     $self->match_char('');
1824     return 1;
1825 }
1826
1827 =head1 MSA attributes
1828
1829 Methods for setting and reading the MSA attributes.
1830
1831 Note that the methods defining character semantics depend on the user
1832 to set them sensibly.  They are needed only by certain input/output
1833 methods. Unset them by setting to an empty string ('').
1834
1835 =head2 id
1836
1837  Title     : id
1838  Usage     : $myalign->id("Ig")
1839  Function  : Gets/sets the id field of the alignment
1840  Returns   : An id string
1841  Argument  : An id string (optional)
1842
1843 =cut
1844
1845 sub id {
1846     my ($self, $name) = @_;
1847
1848     if (defined( $name )) {
1849         $self->{'_id'} = $name;
1850     }
1851
1852     return $self->{'_id'};
1853 }
1854
1855 =head2 accession
1856
1857  Title     : accession
1858  Usage     : $myalign->accession("PF00244")
1859  Function  : Gets/sets the accession field of the alignment
1860  Returns   : An acc string
1861  Argument  : An acc string (optional)
1862
1863 =cut
1864
1865 sub accession {
1866     my ($self, $acc) = @_;
1867
1868     if (defined( $acc )) {
1869         $self->{'_accession'} = $acc;
1870     }
1871
1872     return $self->{'_accession'};
1873 }
1874
1875 =head2 description
1876
1877  Title     : description
1878  Usage     : $myalign->description("14-3-3 proteins")
1879  Function  : Gets/sets the description field of the alignment
1880  Returns   : An description string
1881  Argument  : An description string (optional)
1882
1883 =cut
1884
1885 sub description {
1886     my ($self, $name) = @_;
1887
1888     if (defined( $name )) {
1889         $self->{'_description'} = $name;
1890     }
1891
1892     return $self->{'_description'};
1893 }
1894
1895 =head2 missing_char
1896
1897  Title     : missing_char
1898  Usage     : $myalign->missing_char("?")
1899  Function  : Gets/sets the missing_char attribute of the alignment
1900              It is generally recommended to set it to 'n' or 'N'
1901              for nucleotides and to 'X' for protein.
1902  Returns   : An missing_char string,
1903  Argument  : An missing_char string (optional)
1904
1905 =cut
1906
1907 sub missing_char {
1908     my ($self, $char) = @_;
1909
1910     if (defined $char ) {
1911         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1912         $self->{'_missing_char'} = $char;
1913     }
1914
1915     return $self->{'_missing_char'};
1916 }
1917
1918 =head2 match_char
1919
1920  Title     : match_char
1921  Usage     : $myalign->match_char('.')
1922  Function  : Gets/sets the match_char attribute of the alignment
1923  Returns   : An match_char string,
1924  Argument  : An match_char string (optional)
1925
1926 =cut
1927
1928 sub match_char {
1929     my ($self, $char) = @_;
1930
1931     if (defined $char ) {
1932         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1933         $self->{'_match_char'} = $char;
1934     }
1935
1936     return $self->{'_match_char'};
1937 }
1938
1939 =head2 gap_char
1940
1941  Title     : gap_char
1942  Usage     : $myalign->gap_char('-')
1943  Function  : Gets/sets the gap_char attribute of the alignment
1944  Returns   : An gap_char string, defaults to '-'
1945  Argument  : An gap_char string (optional)
1946
1947 =cut
1948
1949 sub gap_char {
1950     my ($self, $char) = @_;
1951
1952     if (defined $char || ! defined $self->{'_gap_char'} ) {
1953         $char= '-' unless defined $char;
1954         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1955         $self->{'_gap_char'} = $char;
1956     }
1957     return $self->{'_gap_char'};
1958 }
1959
1960 =head2 symbol_chars
1961
1962  Title   : symbol_chars
1963  Usage   : my @symbolchars = $aln->symbol_chars;
1964  Function: Returns all the seen symbols (other than gaps)
1965  Returns : array of characters that are the seen symbols
1966  Args    : boolean to include the gap/missing/match characters
1967
1968 =cut
1969
1970 sub symbol_chars{
1971    my ($self,$includeextra) = @_;
1972
1973    unless ($self->{'_symbols'}) {
1974        foreach my $seq ($self->each_seq) {
1975            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1976        }
1977    }
1978    my %copy = %{$self->{'_symbols'}};
1979    if( ! $includeextra ) {
1980        foreach my $char ( $self->gap_char, $self->match_char,
1981                           $self->missing_char) {
1982            delete $copy{$char} if( defined $char );
1983        }
1984    }
1985    return keys %copy;
1986 }
1987
1988 =head1 Alignment descriptors
1989
1990 These read only methods describe the MSA in various ways.
1991
1992
1993 =head2 score
1994
1995  Title     : score
1996  Usage     : $str = $ali->score()
1997  Function  : get/set a score of the alignment
1998  Returns   : a score for the alignment
1999  Argument  : an optional score to set
2000
2001 =cut
2002
2003 sub score {
2004   my $self = shift;
2005   $self->{score} = shift if @_;
2006   return $self->{score};
2007 }
2008
2009 =head2 consensus_string
2010
2011  Title     : consensus_string
2012  Usage     : $str = $ali->consensus_string($threshold_percent)
2013  Function  : Makes a strict consensus
2014  Returns   : Consensus string
2015  Argument  : Optional treshold ranging from 0 to 100.
2016              The consensus residue has to appear at least threshold %
2017              of the sequences at a given location, otherwise a '?'
2018              character will be placed at that location.
2019              (Default value = 0%)
2020
2021 =cut
2022
2023 sub consensus_string {
2024     my $self = shift;
2025     my $threshold = shift;
2026
2027     my $out = "";
2028     my $len = $self->length - 1;
2029
2030     foreach ( 0 .. $len ) {
2031         $out .= $self->_consensus_aa($_,$threshold);
2032     }
2033     return $out;
2034 }
2035
2036 sub _consensus_aa {
2037     my $self = shift;
2038     my $point = shift;
2039     my $threshold_percent = shift || -1 ;
2040     my ($seq,%hash,$count,$letter,$key);
2041     my $gapchar = $self->gap_char;
2042     foreach $seq ( $self->each_seq() ) {
2043         $letter = substr($seq->seq,$point,1);
2044         $self->throw("--$point-----------") if $letter eq '';
2045         ($letter eq $gapchar || $letter =~ /\./) && next;
2046         # print "Looking at $letter\n";
2047         $hash{$letter}++;
2048     }
2049     my $number_of_sequences = $self->num_sequences();
2050     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2051     $count = -1;
2052     $letter = '?';
2053
2054     foreach $key ( sort keys %hash ) {
2055         # print "Now at $key $hash{$key}\n";
2056         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2057             $letter = $key;
2058             $count = $hash{$key};
2059         }
2060     }
2061     return $letter;
2062 }
2063
2064
2065 =head2 consensus_iupac
2066
2067  Title     : consensus_iupac
2068  Usage     : $str = $ali->consensus_iupac()
2069  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2070              and RNA. The output is in upper case except when gaps in
2071              a column force output to be in lower case.
2072
2073              Note that if your alignment sequences contain a lot of
2074              IUPAC ambiquity codes you often have to manually set
2075              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2076              indicate a protein sequence.
2077  Returns   : consensus string
2078  Argument  : none
2079  Throws    : on protein sequences
2080
2081 =cut
2082
2083 sub consensus_iupac {
2084     my $self = shift;
2085     my $out = "";
2086     my $len = $self->length-1;
2087
2088     # only DNA and RNA sequences are valid
2089     foreach my $seq ( $self->each_seq() ) {
2090         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2091             if $seq->alphabet eq 'protein';
2092     }
2093     # loop over the alignment columns
2094     foreach my $count ( 0 .. $len ) {
2095         $out .= $self->_consensus_iupac($count);
2096     }
2097     return $out;
2098 }
2099
2100 sub _consensus_iupac {
2101     my ($self, $column) = @_;
2102     my ($string, $char, $rna);
2103
2104     #determine all residues in a column
2105     foreach my $seq ( $self->each_seq() ) {
2106         $string .= substr($seq->seq, $column, 1);
2107     }
2108     $string = uc $string;
2109
2110     # quick exit if there's an N in the string
2111     if ($string =~ /N/) {
2112         $string =~ /\W/ ? return 'n' : return 'N';
2113     }
2114     # ... or if there are only gap characters
2115     return '-' if $string =~ /^\W+$/;
2116
2117     # treat RNA as DNA in regexps
2118     if ($string =~ /U/) {
2119         $string =~ s/U/T/;
2120         $rna = 1;
2121     }
2122
2123     # the following s///'s only need to be done to the _first_ ambiguity code
2124     # as we only need to see the _range_ of characters in $string
2125
2126     if ($string =~ /[VDHB]/) {
2127         $string =~ s/V/AGC/;
2128         $string =~ s/D/AGT/;
2129         $string =~ s/H/ACT/;
2130         $string =~ s/B/CTG/;
2131     }
2132
2133     if ($string =~ /[SKYRWM]/) {
2134         $string =~ s/S/GC/;
2135         $string =~ s/K/GT/;
2136         $string =~ s/Y/CT/;
2137         $string =~ s/R/AG/;
2138         $string =~ s/W/AT/;
2139         $string =~ s/M/AC/;
2140     }
2141
2142     # and now the guts of the thing
2143
2144     if ($string =~ /A/) {
2145         $char = 'A';                     # A                      A
2146         if ($string =~ /G/) {
2147             $char = 'R';                 # A and G (purines)      R
2148             if ($string =~ /C/) {
2149                 $char = 'V';             # A and G and C          V
2150                 if ($string =~ /T/) {
2151                     $char = 'N';         # A and G and C and T    N
2152                 }
2153             } elsif ($string =~ /T/) {
2154                 $char = 'D';             # A and G and T          D
2155             }
2156         } elsif ($string =~ /C/) {
2157             $char = 'M';                 # A and C                M
2158             if ($string =~ /T/) {
2159                 $char = 'H';             # A and C and T          H
2160             }
2161         } elsif ($string =~ /T/) {
2162             $char = 'W';                 # A and T                W
2163         }
2164     } elsif ($string =~ /C/) {
2165         $char = 'C';                     # C                      C
2166         if ($string =~ /T/) {
2167             $char = 'Y';                 # C and T (pyrimidines)  Y
2168             if ($string =~ /G/) {
2169                 $char = 'B';             # C and T and G          B
2170             }
2171         } elsif ($string =~ /G/) {
2172             $char = 'S';                 # C and G                S
2173         }
2174     } elsif ($string =~ /G/) {
2175         $char = 'G';                     # G                      G
2176         if ($string =~ /C/) {
2177             $char = 'S';                 # G and C                S
2178         } elsif ($string =~ /T/) {
2179             $char = 'K';                 # G and T                K
2180         }
2181     } elsif ($string =~ /T/) {
2182         $char = 'T';                     # T                      T
2183     }
2184
2185     $char = 'U' if $rna and $char eq 'T';
2186     $char = lc $char if $string =~ /\W/;
2187
2188     return $char;
2189 }
2190
2191
2192 =head2 consensus_meta
2193
2194  Title     : consensus_meta
2195  Usage     : $seqmeta = $ali->consensus_meta()
2196  Function  : Returns a Bio::Seq::Meta object containing the consensus
2197              strings derived from meta data analysis.
2198  Returns   : Bio::Seq::Meta
2199  Argument  : Bio::Seq::Meta
2200  Throws    : non-MetaI object
2201
2202 =cut
2203
2204 sub consensus_meta {
2205     my ($self, $meta) = @_;
2206     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2207         $self->throw('Not a Bio::Seq::MetaI object');
2208     }
2209     return $self->{'_aln_meta'} = $meta if $meta;
2210     return $self->{'_aln_meta'}
2211 }
2212
2213 =head2 is_flush
2214
2215  Title     : is_flush
2216  Usage     : if ( $ali->is_flush() )
2217  Function  : Tells you whether the alignment
2218            : is flush, i.e. all of the same length
2219  Returns   : 1 or 0
2220  Argument  :
2221
2222 =cut
2223
2224 sub is_flush {
2225     my ($self,$report) = @_;
2226     my $seq;
2227     my $length = (-1);
2228     my $temp;
2229
2230     foreach $seq ( $self->each_seq() ) {
2231         if( $length == (-1) ) {
2232             $length = CORE::length($seq->seq());
2233             next;
2234         }
2235
2236         $temp = CORE::length($seq->seq());
2237         if( $temp != $length ) {
2238             $self->warn("expecting $length not $temp from ".
2239                         $seq->display_id) if( $report );
2240             $self->debug("expecting $length not $temp from ".
2241                          $seq->display_id);
2242             $self->debug($seq->seq(). "\n");
2243             return 0;
2244         }
2245     }
2246
2247     return 1;
2248 }
2249
2250
2251 =head2 length
2252
2253  Title     : length()
2254  Usage     : $len = $ali->length()
2255  Function  : Returns the maximum length of the alignment.
2256              To be sure the alignment is a block, use is_flush
2257  Returns   : Integer
2258  Argument  :
2259
2260 =cut
2261
2262 sub length_aln {
2263     my $self = shift;
2264     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2265     $self->length(@_);
2266 }
2267
2268 sub length {
2269     my $self = shift;
2270     my $seq;
2271     my $length = -1;
2272     my $temp;
2273
2274     foreach $seq ( $self->each_seq() ) {
2275         $temp = $seq->length();
2276         if( $temp > $length ) {
2277             $length = $temp;
2278         }
2279     }
2280
2281     return $length;
2282 }
2283
2284
2285 =head2 maxdisplayname_length
2286
2287  Title     : maxdisplayname_length
2288  Usage     : $ali->maxdisplayname_length()
2289  Function  : Gets the maximum length of the displayname in the
2290              alignment. Used in writing out various MSA formats.
2291  Returns   : integer
2292  Argument  :
2293
2294 =cut
2295
2296 sub maxname_length {
2297     my $self = shift;
2298     $self->deprecated("maxname_length - deprecated method.".
2299                       " Use maxdisplayname_length() instead.");
2300     $self->maxdisplayname_length();
2301 }
2302
2303 sub maxnse_length {
2304     my $self = shift;
2305     $self->deprecated("maxnse_length - deprecated method.".
2306                       " Use maxnse_length() instead.");
2307     $self->maxdisplayname_length();
2308 }
2309
2310 sub maxdisplayname_length {
2311     my $self = shift;
2312     my $maxname = (-1);
2313     my ($seq,$len);
2314
2315     foreach $seq ( $self->each_seq() ) {
2316         $len = CORE::length $self->displayname($seq->get_nse());
2317
2318         if( $len > $maxname ) {
2319             $maxname = $len;
2320         }
2321     }
2322
2323     return $maxname;
2324 }
2325
2326 =head2 max_metaname_length
2327
2328  Title     : max_metaname_length
2329  Usage     : $ali->max_metaname_length()
2330  Function  : Gets the maximum length of the meta name tags in the
2331              alignment for the sequences and for the alignment.
2332              Used in writing out various MSA formats.
2333  Returns   : integer
2334  Argument  : None
2335
2336 =cut
2337
2338 sub max_metaname_length {
2339     my $self = shift;
2340     my $maxname = (-1);
2341     my ($seq,$len);
2342
2343     # check seq meta first
2344     for $seq ( $self->each_seq() ) {
2345         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2346         for my $mtag ($seq->meta_names) {
2347             $len = CORE::length $mtag;
2348             if( $len > $maxname ) {
2349                 $maxname = $len;
2350             }
2351         }
2352     }
2353
2354     # alignment meta
2355     for my $meta ($self->consensus_meta) {
2356         next unless $meta;
2357         for my $name ($meta->meta_names) {
2358             $len = CORE::length $name;
2359             if( $len > $maxname ) {
2360                 $maxname = $len;
2361             }
2362         }
2363     }
2364
2365     return $maxname;
2366 }
2367
2368 =head2 num_residues
2369
2370  Title     : num_residues
2371  Usage     : $no = $ali->num_residues
2372  Function  : number of residues in total in the alignment
2373  Returns   : integer
2374  Argument  :
2375  Note      : replaces no_residues()
2376
2377 =cut
2378
2379 sub num_residues {
2380     my $self = shift;
2381     my $count = 0;
2382
2383     foreach my $seq ($self->each_seq) {
2384         my $str = $seq->seq();
2385
2386         $count += ($str =~ s/[A-Za-z]//g);
2387     }
2388
2389     return $count;
2390 }
2391
2392 =head2 num_sequences
2393
2394  Title     : num_sequences
2395  Usage     : $depth = $ali->num_sequences
2396  Function  : number of sequence in the sequence alignment
2397  Returns   : integer
2398  Argument  : none
2399  Note      : replaces no_sequences()
2400
2401 =cut
2402
2403 sub num_sequences {
2404     my $self = shift;
2405     return scalar($self->each_seq);
2406 }
2407
2408
2409 =head2 average_percentage_identity
2410
2411  Title   : average_percentage_identity
2412  Usage   : $id = $align->average_percentage_identity
2413  Function: The function uses a fast method to calculate the average
2414            percentage identity of the alignment
2415  Returns : The average percentage identity of the alignment
2416  Args    : None
2417  Notes   : This method implemented by Kevin Howe calculates a figure that is
2418            designed to be similar to the average pairwise identity of the
2419            alignment (identical in the absence of gaps), without having to
2420            explicitly calculate pairwise identities proposed by Richard Durbin.
2421            Validated by Ewan Birney ad Alex Bateman.
2422
2423 =cut
2424
2425 sub average_percentage_identity{
2426    my ($self,@args) = @_;
2427
2428    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2429                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2430
2431    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2432
2433    if (! $self->is_flush()) {
2434        $self->throw("All sequences in the alignment must be the same length");
2435    }
2436
2437    @seqs = $self->each_seq();
2438    $len = $self->length();
2439
2440    # load the each hash with correct keys for existence checks
2441
2442    for( my $index=0; $index < $len; $index++) {
2443        foreach my $letter (@alphabet) {
2444            $countHashes[$index]->{$letter} = 0;
2445        }
2446    }
2447    foreach my $seq (@seqs)  {
2448        my @seqChars = split //, $seq->seq();
2449        for( my $column=0; $column < @seqChars; $column++ ) {
2450            my $char = uc($seqChars[$column]);
2451            if (exists $countHashes[$column]->{$char}) {
2452                $countHashes[$column]->{$char}++;
2453            }
2454        }
2455    }
2456
2457    $total = 0;
2458    $divisor = 0;
2459    for(my $column =0; $column < $len; $column++) {
2460        my %hash = %{$countHashes[$column]};
2461        $subdivisor = 0;
2462        foreach my $res (keys %hash) {
2463            $total += $hash{$res}*($hash{$res} - 1);
2464            $subdivisor += $hash{$res};
2465        }
2466        $divisor += $subdivisor * ($subdivisor - 1);
2467    }
2468    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2469 }
2470
2471 =head2 percentage_identity
2472
2473  Title   : percentage_identity
2474  Usage   : $id = $align->percentage_identity
2475  Function: The function calculates the average percentage identity
2476            (aliased to average_percentage_identity)
2477  Returns : The average percentage identity
2478  Args    : None
2479
2480 =cut
2481
2482 sub percentage_identity {
2483     my $self = shift;
2484     return $self->average_percentage_identity();
2485 }
2486
2487 =head2 overall_percentage_identity
2488
2489  Title   : overall_percentage_identity
2490  Usage   : $id = $align->overall_percentage_identity
2491            $id = $align->overall_percentage_identity('short')
2492  Function: The function calculates the percentage identity of
2493            the conserved columns
2494  Returns : The percentage identity of the conserved columns
2495  Args    : length value to use, optional defaults to alignment length
2496                  possible values: 'align', 'short', 'long'
2497
2498 The argument values 'short' and 'long' refer to shortest and longest
2499 sequence in the alignment. Method modification code by Hongyu Zhang.
2500
2501 =cut
2502
2503 sub overall_percentage_identity{
2504    my ($self, $length_measure) = @_;
2505
2506    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2507                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2508
2509    my ($len, $total, @seqs, @countHashes);
2510
2511    my %enum = map {$_ => 1} qw (align short long);
2512
2513    $self->throw("Unknown argument [$length_measure]")
2514        if $length_measure and not $enum{$length_measure};
2515    $length_measure ||= 'align';
2516
2517    if (! $self->is_flush()) {
2518        $self->throw("All sequences in the alignment must be the same length");
2519    }
2520
2521    @seqs = $self->each_seq();
2522    $len = $self->length();
2523
2524    # load the each hash with correct keys for existence checks
2525    for( my $index=0; $index < $len; $index++) {
2526        foreach my $letter (@alphabet) {
2527            $countHashes[$index]->{$letter} = 0;
2528        }
2529    }
2530    foreach my $seq (@seqs)  {
2531        my @seqChars = split //, $seq->seq();
2532        for( my $column=0; $column < @seqChars; $column++ ) {
2533            my $char = uc($seqChars[$column]);
2534            if (exists $countHashes[$column]->{$char}) {
2535                $countHashes[$column]->{$char}++;
2536            }
2537        }
2538    }
2539
2540    $total = 0;
2541    for(my $column =0; $column < $len; $column++) {
2542        my %hash = %{$countHashes[$column]};
2543        foreach ( values %hash ) {
2544            next if( $_ == 0 );
2545            $total++ if( $_ == scalar @seqs );
2546            last;
2547        }
2548    }
2549
2550    if ($length_measure eq 'short') {
2551        ## find the shortest length
2552        $len = 0;
2553        foreach my $seq ($self->each_seq) {
2554            my $count = $seq->seq =~ tr/[A-Za-z]//;
2555            if ($len) {
2556                $len = $count if $count < $len;
2557            } else {
2558                $len = $count;
2559            }
2560        }
2561    }
2562    elsif ($length_measure eq 'long') {
2563        ## find the longest length
2564        $len = 0;
2565        foreach my $seq ($self->each_seq) {
2566            my $count = $seq->seq =~ tr/[A-Za-z]//;
2567            if ($len) {
2568                $len = $count if $count > $len;
2569            } else {
2570                $len = $count;
2571            }
2572        }
2573    }
2574
2575    return ($total / $len ) * 100.0;
2576 }
2577
2578
2579
2580 =head1 Alignment positions
2581
2582 Methods to map a sequence position into an alignment column and back.
2583 column_from_residue_number() does the former. The latter is really a
2584 property of the sequence object and can done using
2585 L<Bio::LocatableSeq::location_from_column>:
2586
2587     # select somehow a sequence from the alignment, e.g.
2588     my $seq = $aln->get_seq_by_pos(1);
2589     #$loc is undef or Bio::LocationI object
2590     my $loc = $seq->location_from_column(5);
2591
2592 =head2 column_from_residue_number
2593
2594  Title   : column_from_residue_number
2595  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2596  Function: This function gives the position in the alignment
2597            (i.e. column number) of the given residue number in the
2598            sequence with the given name. For example, for the
2599            alignment
2600
2601              Seq1/91-97 AC..DEF.GH.
2602              Seq2/24-30 ACGG.RTY...
2603                 Seq3/43-51 AC.DDEF.GHI
2604
2605            column_from_residue_number( "Seq1", 94 ) returns 6.
2606            column_from_residue_number( "Seq2", 25 ) returns 2.
2607            column_from_residue_number( "Seq3", 50 ) returns 10.
2608
2609            An exception is thrown if the residue number would lie
2610            outside the length of the aligment
2611            (e.g. column_from_residue_number( "Seq2", 22 )
2612
2613           Note: If the the parent sequence is represented by more than
2614                 one alignment sequence and the residue number is present in
2615                 them, this method finds only the first one.
2616
2617  Returns : A column number for the position in the alignment of the
2618            given residue in the given sequence (1 = first column)
2619  Args    : A sequence id/name (not a name/start-end)
2620            A residue number in the whole sequence (not just that
2621            segment of it in the alignment)
2622
2623 =cut
2624
2625 sub column_from_residue_number {
2626     my ($self, $name, $resnumber) = @_;
2627
2628     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2629     $self->throw("Second argument residue number missing") unless $resnumber;
2630
2631     foreach my $seq ($self->each_seq_with_id($name)) {
2632         my $col;
2633         eval {
2634             $col = $seq->column_from_residue_number($resnumber);
2635         };
2636         next if $@;
2637         return $col;
2638     }
2639
2640     $self->throw("Could not find a sequence segment in $name ".
2641                  "containing residue number $resnumber");
2642
2643 }
2644
2645 =head1 Sequence names
2646
2647 Methods to manipulate the display name. The default name based on the
2648 sequence id and subsequence positions can be overridden in various
2649 ways.
2650
2651 =head2 displayname
2652
2653  Title     : displayname
2654  Usage     : $myalign->displayname("Ig", "IgA")
2655  Function  : Gets/sets the display name of a sequence in the alignment
2656  Returns   : A display name string
2657  Argument  : name of the sequence
2658              displayname of the sequence (optional)
2659
2660 =cut
2661
2662 sub get_displayname {
2663     my $self = shift;
2664     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2665     $self->displayname(@_);
2666 }
2667
2668 sub set_displayname {
2669     my $self = shift;
2670     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2671     $self->displayname(@_);
2672 }
2673
2674 sub displayname {
2675     my ($self, $name, $disname) = @_;
2676
2677     $self->throw("No sequence with name [$name]")
2678         unless defined $self->{'_seq'}->{$name};
2679
2680     if(  $disname and  $name) {
2681         $self->{'_dis_name'}->{$name} = $disname;
2682         return $disname;
2683     }
2684     elsif( defined $self->{'_dis_name'}->{$name} ) {
2685         return  $self->{'_dis_name'}->{$name};
2686     } else {
2687         return $name;
2688     }
2689 }
2690
2691 =head2 set_displayname_count
2692
2693  Title     : set_displayname_count
2694  Usage     : $ali->set_displayname_count
2695  Function  : Sets the names to be name_# where # is the number of
2696              times this name has been used.
2697  Returns   : 1, on success
2698  Argument  :
2699
2700 =cut
2701
2702 sub set_displayname_count {
2703     my $self= shift;
2704     my (@arr,$name,$seq,$count,$temp,$nse);
2705
2706     foreach $seq ( $self->each_alphabetically() ) {
2707         $nse = $seq->get_nse();
2708
2709         #name will be set when this is the second
2710         #time (or greater) is has been seen
2711
2712         if( defined $name and $name eq ($seq->id()) ) {
2713             $temp = sprintf("%s_%s",$name,$count);
2714             $self->displayname($nse,$temp);
2715             $count++;
2716         } else {
2717             $count = 1;
2718             $name = $seq->id();
2719             $temp = sprintf("%s_%s",$name,$count);
2720             $self->displayname($nse,$temp);
2721             $count++;
2722         }
2723     }
2724     return 1;
2725 }
2726
2727 =head2 set_displayname_flat
2728
2729  Title     : set_displayname_flat
2730  Usage     : $ali->set_displayname_flat()
2731  Function  : Makes all the sequences be displayed as just their name,
2732              not name/start-end
2733  Returns   : 1
2734  Argument  :
2735
2736 =cut
2737
2738 sub set_displayname_flat {
2739     my $self = shift;
2740     my ($nse,$seq);
2741
2742     foreach $seq ( $self->each_seq() ) {
2743         $nse = $seq->get_nse();
2744         $self->displayname($nse,$seq->id());
2745     }
2746     return 1;
2747 }
2748
2749 =head2 set_displayname_normal
2750
2751  Title     : set_displayname_normal
2752  Usage     : $ali->set_displayname_normal()
2753  Function  : Makes all the sequences be displayed as name/start-end
2754  Returns   : 1, on success
2755  Argument  :
2756
2757 =cut
2758
2759 sub set_displayname_normal {
2760     my $self = shift;
2761     my ($nse,$seq);
2762
2763     foreach $seq ( $self->each_seq() ) {
2764         $nse = $seq->get_nse();
2765         $self->displayname($nse,$nse);
2766     }
2767     return 1;
2768 }
2769
2770 =head2 source
2771
2772  Title   : source
2773  Usage   : $obj->source($newval)
2774  Function: sets the Alignment source program
2775  Example :
2776  Returns : value of source
2777  Args    : newvalue (optional)
2778
2779
2780 =cut
2781
2782 sub source{
2783    my ($self,$value) = @_;
2784    if( defined $value) {
2785       $self->{'_source'} = $value;
2786     }
2787     return $self->{'_source'};
2788 }
2789
2790 =head2 set_displayname_safe
2791
2792  Title     : set_displayname_safe
2793  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2794  Function  : Assign machine-generated serial names to sequences in input order.
2795              Designed to protect names during PHYLIP runs. Assign 10-char string
2796              in the form of "S000000001" to "S999999999". Restore the original
2797              names using "restore_displayname".
2798  Returns   : 1. a new $aln with system names;
2799              2. a hash ref for restoring names
2800  Argument  : Number for id length (default 10)
2801
2802 =cut
2803
2804 sub set_displayname_safe {
2805     my $self = shift;
2806     my $idlength = shift || 10;
2807     my ($seq, %phylip_name);
2808     my $ct=0;
2809     my $new=Bio::SimpleAlign->new();
2810     foreach $seq ( $self->each_seq() ) {
2811         $ct++;
2812         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2813         $phylip_name{$pname}=$seq->id();
2814         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2815                                             -seq      => $seq->seq(),
2816                                             -alphabet => $seq->alphabet,
2817                                             -start    => $seq->{_start},
2818                                             -end      => $seq->{_end}
2819                                             );
2820         $new->add_seq($new_seq);
2821     }
2822
2823     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2824     return ($new, \%phylip_name);
2825 }
2826
2827 =head2 restore_displayname
2828
2829  Title     : restore_displayname
2830  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2831  Function  : Restore original sequence names (after running
2832              $ali->set_displayname_safe)
2833  Returns   : a new $aln with names restored.
2834  Argument  : a hash reference of names from "set_displayname_safe".
2835
2836 =cut
2837
2838 sub restore_displayname {
2839     my $self = shift;
2840     my $ref=shift;
2841     my %name=%$ref;
2842     my $new=Bio::SimpleAlign->new();
2843     foreach my $seq ( $self->each_seq() ) {
2844       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2845       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2846                                           -seq      => $seq->seq(),
2847                                           -alphabet => $seq->alphabet,
2848                                           -start    => $seq->{_start},
2849                                           -end      => $seq->{_end}
2850                                           );
2851       $new->add_seq($new_seq);
2852     }
2853     return $new;
2854 }
2855
2856 =head2 sort_by_start
2857
2858  Title     : sort_by_start
2859  Usage     : $ali->sort_by_start
2860  Function  : Changes the order of the alignment to the start position of each
2861              subalignment
2862  Returns   :
2863  Argument  :
2864
2865 =cut
2866
2867 sub sort_by_start {
2868     my $self = shift;
2869     my ($seq,$nse,@arr,%hash,$count);
2870     foreach $seq ( $self->each_seq() ) {
2871         $nse = $seq->get_nse;
2872         $hash{$nse} = $seq;
2873     }
2874     $count = 0;
2875     %{$self->{'_order'}} = (); # reset the hash;
2876     foreach $nse ( sort _startend keys %hash) {
2877         $self->{'_order'}->{$count} = $nse;
2878         $count++;
2879     }
2880     1;
2881 }
2882
2883 sub _startend
2884 {
2885     my ($aname,$arange) = split (/[\/]/,$a);
2886     my ($bname,$brange) = split (/[\/]/,$b);
2887     my ($astart,$aend) = split(/\-/,$arange);
2888     my ($bstart,$bend) = split(/\-/,$brange);
2889     return $astart <=> $bstart;
2890 }
2891
2892 =head2 bracket_string
2893
2894  Title     : bracket_string
2895  Usage     : my @params = (-refseq     => 'testseq',
2896                            -allele1    => 'allele1',
2897                            -allele2    => 'allele2',
2898                            -delimiters => '{}',
2899                            -separator  => '/');
2900              $str = $aln->bracket_string(@params)
2901
2902  Function :  When supplied with a list of parameters (see below), returns a
2903              string in BIC format. This is used for allelic comparisons.
2904              Briefly, if either allele contains a base change when compared to
2905              the refseq, the base or gap for each allele is represented in
2906              brackets in the order present in the 'alleles' parameter.
2907
2908              For the following data:
2909
2910              >testseq
2911              GGATCCATTGCTACT
2912              >allele1
2913              GGATCCATTCCTACT
2914              >allele2
2915              GGAT--ATTCCTCCT
2916
2917              the returned string with parameters 'refseq => testseq' and
2918              'alleles => [qw(allele1 allele2)]' would be:
2919
2920              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2921  Returns   : BIC-formatted string
2922  Argument  : Required args
2923                 refseq    : string (ID) of the reference sequence used
2924                             as basis for comparison
2925                 allele1   : string (ID) of the first allele
2926                 allele2   : string (ID) of the second allele
2927              Optional args
2928                 delimiters: two symbol string of left and right delimiters.
2929                             Only the first two symbols are used
2930                             default = '[]'
2931                 separator : string used as a separator.  Only the first
2932                             symbol is used
2933                             default = '/'
2934  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2935
2936 =cut
2937
2938 sub bracket_string {
2939     my ($self, @args) = @_;
2940     my ($ref, $a1, $a2, $delim, $sep) =
2941         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2942     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2943     my ($ld, $rd);
2944     ($ld, $rd) = split('', $delim, 2) if $delim;
2945     $ld ||= '[';
2946     $rd ||= ']';
2947     $sep ||= '/';
2948     my ($refseq, $allele1, $allele2) =
2949         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2950     if (!$refseq || !$allele1 || !$allele2) {
2951         $self->throw("One of your refseq/allele IDs is invalid!");
2952     }
2953     my $len = $self->length-1;
2954     my $bic = '';
2955     # loop over the alignment columns
2956     for my $column ( 0 .. $len ) {
2957         my $string;
2958         my ($compres, $res1, $res2) =
2959             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2960         # are any of the allele symbols different from the refseq?
2961         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2962                 $ld.$res1.$sep.$res2.$rd;
2963         $bic .= $string;
2964     }
2965     return $bic;
2966 }
2967
2968
2969 =head2 methods implementing Bio::FeatureHolderI
2970
2971 FeatureHolderI implementation to support labeled character sets like one
2972 would get from NEXUS represented data.
2973
2974 =head2 get_SeqFeatures
2975
2976  Usage   : @features = $aln->get_SeqFeatures
2977  Function: Get the feature objects held by this feature holder.
2978  Example :
2979  Returns : an array of Bio::SeqFeatureI implementing objects
2980  Args    : optional filter coderef, taking a Bio::SeqFeatureI
2981          : as argument, returning TRUE if wanted, FALSE if
2982          : unwanted
2983
2984 =cut
2985
2986 sub get_SeqFeatures {
2987     my $self = shift;
2988     my $filter_cb = shift;
2989     $self->throw("Arg (filter callback) must be a coderef") unless
2990         !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2991     if( !defined $self->{'_as_feat'} ) {
2992         $self->{'_as_feat'} = [];
2993     }
2994     if ($filter_cb) {
2995         return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2996     }
2997     return @{$self->{'_as_feat'}};
2998 }
2999
3000 =head2 add_SeqFeature
3001
3002  Usage   : $aln->add_SeqFeature($subfeat);
3003  Function: adds a SeqFeature into the SeqFeature array.
3004  Example :
3005  Returns : true on success
3006  Args    : a Bio::SeqFeatureI object
3007  Note    : This implementation is not compliant
3008            with Bio::FeatureHolderI
3009
3010 =cut
3011
3012 sub add_SeqFeature {
3013    my ($self,@feat) = @_;
3014
3015    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3016
3017    foreach my $feat ( @feat ) {
3018        if( !$feat->isa("Bio::SeqFeatureI") ) {
3019            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3020        }
3021
3022        push(@{$self->{'_as_feat'}},$feat);
3023    }
3024    return 1;
3025 }
3026
3027
3028 =head2 remove_SeqFeatures
3029
3030  Usage   : $obj->remove_SeqFeatures
3031  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3032            remove that subset from the returned array, and add back the rest.
3033  Returns : The array of Bio::SeqFeatureI features that was
3034            deleted from this alignment.
3035  Args    : none
3036
3037 =cut
3038
3039 sub remove_SeqFeatures {
3040     my $self = shift;
3041
3042     return () unless $self->{'_as_feat'};
3043     my @feats = @{$self->{'_as_feat'}};
3044     $self->{'_as_feat'} = [];
3045     return @feats;
3046 }
3047
3048 =head2 feature_count
3049
3050  Title   : feature_count
3051  Usage   : $obj->feature_count()
3052  Function: Return the number of SeqFeatures attached to the alignment
3053  Returns : integer representing the number of SeqFeatures
3054  Args    : None
3055
3056 =cut
3057
3058 sub feature_count {
3059     my ($self) = @_;
3060
3061     if (defined($self->{'_as_feat'})) {
3062         return ($#{$self->{'_as_feat'}} + 1);
3063     } else {
3064         return 0;
3065     }
3066 }
3067
3068 =head2 get_all_SeqFeatures
3069
3070  Title   : get_all_SeqFeatures
3071  Usage   :
3072  Function: Get all SeqFeatures.
3073  Example :
3074  Returns : an array of Bio::SeqFeatureI implementing objects
3075  Args    : none
3076  Note    : Falls through to Bio::FeatureHolderI implementation.
3077
3078 =cut
3079
3080 =head2 methods for Bio::AnnotatableI
3081
3082 AnnotatableI implementation to support sequence alignments which
3083 contain annotation (NEXUS, Stockholm).
3084
3085 =head2 annotation
3086
3087  Title   : annotation
3088  Usage   : $ann = $aln->annotation or
3089            $aln->annotation($ann)
3090  Function: Gets or sets the annotation
3091  Returns : Bio::AnnotationCollectionI object
3092  Args    : None or Bio::AnnotationCollectionI object
3093
3094 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3095 for more information
3096
3097 =cut
3098
3099 sub annotation {
3100     my ($obj,$value) = @_;
3101     if( defined $value ) {
3102         $obj->throw("object of class ".ref($value)." does not implement ".
3103                 "Bio::AnnotationCollectionI. Too bad.")
3104             unless $value->isa("Bio::AnnotationCollectionI");
3105         $obj->{'_annotation'} = $value;
3106     } elsif( ! defined $obj->{'_annotation'}) {
3107         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3108     }
3109     return $obj->{'_annotation'};
3110 }
3111
3112 =head1 Deprecated methods
3113
3114 =cut
3115
3116 =head2 no_residues
3117
3118  Title     : no_residues
3119  Usage     : $no = $ali->no_residues
3120  Function  : number of residues in total in the alignment
3121  Returns   : integer
3122  Argument  :
3123  Note      : deprecated in favor of num_residues()
3124
3125 =cut
3126
3127 sub no_residues {
3128         my $self = shift;
3129         $self->deprecated(-warn_version => 1.0069,
3130                                           -throw_version => 1.0075,
3131                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3132     $self->num_residues(@_);
3133 }
3134
3135 =head2 no_sequences
3136
3137  Title     : no_sequences
3138  Usage     : $depth = $ali->no_sequences
3139  Function  : number of sequence in the sequence alignment
3140  Returns   : integer
3141  Argument  :
3142  Note      : deprecated in favor of num_sequences()
3143
3144 =cut
3145
3146 sub no_sequences {
3147         my $self = shift;
3148         $self->deprecated(-warn_version => 1.0069,
3149                                           -throw_version => 1.0075,
3150                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3151     $self->num_sequences(@_);
3152 }
3153
3154 =head2 mask_columns
3155
3156  Title     : mask_columns
3157  Usage     : $aln2 = $aln->mask_columns(20,30)
3158  Function  : Masks a slice of the alignment inclusive of start and
3159              end columns, and the first column in the alignment is denoted 1.
3160              Mask beyond the length of the sequence does not do padding.
3161  Returns   : A Bio::SimpleAlign object
3162  Args      : Positive integer for start column, positive integer for end column,
3163              optional string value use for the mask. Example:
3164
3165              $aln2 = $aln->mask_columns(20,30,'?')
3166  Note      : Masking must use a character that is not used for gaps or
3167              frameshifts.  These can be adjusted using the relevant global
3168              variables, but be aware these may be (uncontrollably) modified
3169              elsewhere within BioPerl (see bug 2715)
3170
3171 =cut
3172
3173 sub mask_columns {
3174     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3175     my $self = shift;
3176
3177     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3178              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3179
3180     # coordinates are alignment-based, not sequence-based
3181     my ($start, $end, $mask_char) = @_;
3182     unless (defined $mask_char) { $mask_char = 'N' }
3183
3184     $self->throw("Mask start has to be a positive integer and less than ".
3185                  "alignment length, not [$start]")
3186       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3187     $self->throw("Mask end has to be a positive integer and less than ".
3188                  "alignment length, not [$end]")
3189       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3190     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3191                  "end [$end]") unless $start <= $end;
3192     $self->throw("Mask character $mask_char has to be a single character ".
3193                  "and not a gap or frameshift symbol")
3194       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3195
3196     my $aln = $self->new;
3197     $aln->id($self->id);
3198     foreach my $seq ( $self->each_seq() ) {
3199         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3200          -alphabet => $seq->alphabet,
3201          -strand  => $seq->strand,
3202          -verbose => $self->verbose);
3203
3204         # convert from 1-based alignment coords!
3205         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3206         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3207         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3208         $new_seq->seq($new_dna_string);
3209         $aln->add_seq($new_seq);
3210     }
3211     return $aln;
3212 }
3213
3214
3215
3216 1;