Bio/SimpleAlign.pm

   1 # BioPerl module for SimpleAlign
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12 #
  13 #  History:
  14 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  15 #       May 2001 major rewrite - Heikki Lehvaslaiho
  16
  17 =head1 NAME
  18
  19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  20
  21 =head1 SYNOPSIS
  22
  23   # Use Bio::AlignIO to read in the alignment
  24   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  25   $aln = $str->next_aln();
  26
  27   # Describe
  28   print $aln->length;
  29   print $aln->num_residues;
  30   print $aln->is_flush;
  31   print $aln->num_sequences;
  32   print $aln->score;
  33   print $aln->percentage_identity;
  34   print $aln->consensus_string(50);
  35
  36   # Find the position in the alignment for a sequence location
  37   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  38
  39   # Extract sequences and check values for the alignment column $pos
  40   foreach $seq ($aln->each_seq) {
  41       $res = $seq->subseq($pos, $pos);
  42       $count{$res}++;
  43   }
  44   foreach $res (keys %count) {
  45       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  46   }
  47
  48   # Manipulate
  49   $aln->remove_seq($seq);
  50   $mini_aln = $aln->slice(20,30);  # get a block of columns
  51   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  52   $new_aln = $aln->remove_columns([20,30]); # remove by position
  53   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  54
  55   # Analyze
  56   $str = $aln->consensus_string($threshold_percent);
  57   $str = $aln->match_line();
  58   $str = $aln->cigar_line();
  59   $id = $aln->percentage_identity;
  60
  61   # See the module documentation for details and more methods.
  62
  63 =head1 DESCRIPTION
  64
  65 SimpleAlign is an object that handles a multiple sequence alignment
  66 (MSA). It is very permissive of types (it does not insist on sequences
  67 being all same length, for example). Think of it as a set of sequences
  68 with a whole series of built-in manipulations and methods for reading and
  69 writing alignments.
  70
  71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  72 to store its sequences. These are subsequences with a start and end
  73 positions in the parent reference sequence. Each sequence in the
  74 SimpleAlign object is a Bio::LocatableSeq.
  75
  76 SimpleAlign expects the combination of name, start, and end for a
  77 given sequence to be unique in the alignment, and this is the key for the
  78 internal hashes (name, start, end are abbreviated C<nse> in the code).
  79 However, in some cases people do not want the name/start-end to be displayed:
  80 either multiple names in an alignment or names specific to the alignment
  81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  82 C<displayname>, and generally is what is used to print out the
  83 alignment. They default to name/start-end.
  84
  85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  86
  87 =head1 FEEDBACK
  88
  89 =head2 Mailing Lists
  90
  91 User feedback is an integral part of the evolution of this and other
  92 Bioperl modules.  Send your comments and suggestions preferably to one
  93 of the Bioperl mailing lists.  Your participation is much appreciated.
  94
  95   bioperl-l@bioperl.org                  - General discussion
  96   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  97
  98 =head2 Support
  99
 100 Please direct usage questions or support issues to the mailing list:
 101
 102 I<bioperl-l@bioperl.org>
 103
 104 rather than to the module maintainer directly. Many experienced and
 105 reponsive experts will be able look at the problem and quickly
 106 address it. Please include a thorough description of the problem
 107 with code and data examples if at all possible.
 108
 109 =head2 Reporting Bugs
 110
 111 Report bugs to the Bioperl bug tracking system to help us keep track
 112 the bugs and their resolution. Bug reports can be submitted via the
 113 web:
 114
 115   https://redmine.open-bio.org/projects/bioperl/
 116
 117 =head1 AUTHOR
 118
 119 Ewan Birney, birney@ebi.ac.uk
 120
 121 =head1 CONTRIBUTORS
 122
 123 Allen Day, allenday-at-ucla.edu,
 124 Richard Adams, Richard.Adams-at-ed.ac.uk,
 125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 127 Allen Smith, allens-at-cpan.org,
 128 Jason Stajich, jason-at-bioperl.org,
 129 Anthony Underwood, aunderwood-at-phls.org.uk,
 130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 131 Brian Osborne, bosborne at alum.mit.edu
 132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 133 Hongyu Zhang, forward at hongyu.org
 134 Jay Hannah, jay at jays.net
 135 Alexandr Bezginov, albezg at gmail.com
 136
 137 =head1 SEE ALSO
 138
 139 L<Bio::LocatableSeq>
 140
 141 =head1 APPENDIX
 142
 143 The rest of the documentation details each of the object
 144 methods. Internal methods are usually preceded with a _
 145
 146 =cut
 147
 148 # 'Let the code begin...
 149
 150 package Bio::SimpleAlign;
 151 use vars qw(%CONSERVATION_GROUPS);
 152 use strict;
 153
 154 use Bio::LocatableSeq;  # uses Seq's as list
 155
 156 use Bio::Seq;
 157 use Bio::SeqFeature::Generic;
 158
 159 BEGIN {
 160     # This data should probably be in a more centralized module...
 161     # it is taken from Clustalw documentation.
 162     # These are all the positively scoring groups that occur in the
 163     # Gonnet Pam250 matrix. The strong and weak groups are
 164     # defined as strong score >0.5 and weak score =<0.5 respectively.
 165
 166     %CONSERVATION_GROUPS = (
 167             'strong' => [ qw(
 168                                                  STA
 169                                                  NEQK
 170                                                  NHQK
 171                                                  NDEQ
 172                                                  QHRK
 173                                                  MILV
 174                                                  MILF
 175                                                  HY
 176                                                  FYW )],
 177                                 'weak' => [ qw(
 178                       CSA
 179                                                ATV
 180                                                SAG
 181                                                STNK
 182                                                STPA
 183                                                SGND
 184                                                SNDEQK
 185                                                NDEQHK
 186                                                NEQHRK
 187                                                FVLIM
 188                                                HFY )],);
 189 }
 190
 191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 192             Bio::FeatureHolderI);
 193
 194 =head2 new
 195
 196  Title     : new
 197  Usage     : my $aln = Bio::SimpleAlign->new();
 198  Function  : Creates a new simple align object
 199  Returns   : Bio::SimpleAlign
 200  Args      : -source     => string representing the source program
 201                             where this alignment came from
 202              -annotation => Bio::AnnotationCollectionI
 203              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 204              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 205              -consensus  => consensus string
 206              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 207
 208 =cut
 209
 210
 211 sub new {
 212   my($class,@args) = @_;
 213
 214   my $self = $class->SUPER::new(@args);
 215
 216   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 217                                             SOURCE
 218                                             SCORE
 219                                             ID
 220                                             ACCESSION
 221                                             DESCRIPTION
 222                                             SEQS
 223                                             FEATURES
 224                                             ANNOTATION
 225                                             SEQ_ANNOTATION
 226                                             CONSENSUS
 227                                             CONSENSUS_META
 228                                             )], @args);
 229   $src && $self->source($src);
 230   defined $score && $self->score($score);
 231   # we need to set up internal hashs first!
 232
 233   $self->{'_seq'} = {};
 234   $self->{'_order'} = {};
 235   $self->{'_start_end_lists'} = {};
 236   $self->{'_dis_name'} = {};
 237   $self->{'_id'} = 'NoName';
 238   # maybe we should automatically read in from args. Hmmm...
 239   $id  && $self->id($id);
 240   $acc && $self->accession($acc);
 241   $desc && $self->description($desc);
 242   $coll && $self->annotation($coll);
 243   # sequence annotation is layered into a provided annotation collection (or dies)
 244   if ($sa) {
 245     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 246                  "with a sequence annotation collection")
 247         if !$coll;
 248     $coll->add_Annotation('seq_annotation', $sa);
 249   }
 250   if ($feats && ref $feats eq 'ARRAY') {
 251     for my $feat (@$feats) {
 252         $self->add_SeqFeature($feat);
 253     }
 254   }
 255   $con && $self->consensus($con);
 256   $cmeta && $self->consensus_meta($cmeta);
 257   # assumes these are in correct alignment order
 258   if ($seqs && ref($seqs) eq 'ARRAY') {
 259     for my $seq (@$seqs) {
 260         $self->add_seq($seq);
 261     }
 262   }
 263
 264   return $self; # success - we hope!
 265 }
 266
 267 =head1 Modifier methods
 268
 269 These methods modify the MSA by adding, removing or shuffling complete
 270 sequences.
 271
 272 =head2 add_seq
 273
 274  Title     : add_seq
 275  Usage     : $myalign->add_seq($newseq);
 276              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 277  Function  : Adds another sequence to the alignment. *Does not* align
 278              it - just adds it to the hashes.
 279              If -ORDER is specified, the sequence is inserted at the
 280              the position spec'd by -ORDER, and existing sequences
 281              are pushed down the storage array.
 282  Returns   : nothing
 283  Args      : A Bio::LocatableSeq object
 284              Positive integer for the sequence position (optional)
 285
 286 See L<Bio::LocatableSeq> for more information
 287
 288 =cut
 289
 290 sub addSeq {
 291     my $self = shift;
 292     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 293     $self->add_seq(@_);
 294 }
 295
 296 sub add_seq {
 297     my $self = shift;
 298     my @args = @_;
 299     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 300     my ($name,$id,$start,$end);
 301
 302     unless ($seq) {
 303         $self->throw("LocatableSeq argument required");
 304     }
 305     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 306         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 307     }
 308     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 309     $order--; # jay's patch (user-specified order is 1-origin)
 310
 311     if ($order < 0) {
 312         $self->throw("User-specified value for ORDER must be >= 1");
 313     }
 314
 315     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 316
 317     # build the symbol list for this sequence,
 318     # will prune out the gap and missing/match chars
 319     # when actually asked for the symbol list in the
 320     # symbol_chars
 321     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 322
 323     $name = $seq->get_nse;
 324
 325     if( $self->{'_seq'}->{$name} ) {
 326         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 327     }
 328     else {
 329         $self->debug( "Assigning $name to $order\n");
 330
 331     my $ordh = $self->{'_order'};
 332     if ($ordh->{$order}) {
 333         # make space to insert
 334         # $c->() returns (in reverse order) the first subsequence
 335         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 336         # (3,2,1), and $c->(2,4,5) returns (2).
 337         my $c;
 338         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 339         map {
 340      $ordh->{$_+1} = $ordh->{$_}
 341         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 342
 343     }
 344     $ordh->{$order} = $name;
 345
 346         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 347             $self->{'_start_end_lists'}->{$id} = [];
 348         }
 349         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 350     }
 351
 352     $self->{'_seq'}->{$name} = $seq;
 353
 354 }
 355
 356
 357 =head2 remove_seq
 358
 359  Title     : remove_seq
 360  Usage     : $aln->remove_seq($seq);
 361  Function  : Removes a single sequence from an alignment
 362  Returns   :
 363  Argument  : a Bio::LocatableSeq object
 364
 365 =cut
 366
 367 sub removeSeq {
 368     my $self = shift;
 369     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 370     $self->remove_seq(@_);
 371 }
 372
 373 sub remove_seq {
 374     my $self = shift;
 375     my $seq = shift;
 376     my ($name,$id);
 377
 378     $self->throw("Need Bio::Locatable seq argument ")
 379         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 380
 381     $id = $seq->id();
 382     $name = $seq->get_nse;
 383
 384     if( !exists $self->{'_seq'}->{$name} ) {
 385         $self->throw("Sequence $name does not exist in the alignment to remove!");
 386     }
 387
 388     delete $self->{'_seq'}->{$name};
 389
 390     # we need to remove this seq from the start_end_lists hash
 391
 392     if (exists $self->{'_start_end_lists'}->{$id}) {
 393         # we need to find the sequence in the array.
 394
 395         my ($i, $found);;
 396         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 397             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 398                 $found = 1;
 399                 last;
 400             }
 401         }
 402         if ($found) {
 403             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 404         }
 405         else {
 406             $self->throw("Could not find the sequence to remoce from the start-end list");
 407         }
 408     }
 409     else {
 410         $self->throw("There is no seq list for the name $id");
 411     }
 412     # we need to shift order hash
 413     my %rev_order = reverse %{$self->{'_order'}};
 414     my $no = $rev_order{$name};
 415     my $num_sequences = $self->num_sequences;
 416     for (; $no < $num_sequences; $no++) {
 417        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 418     }
 419     delete $self->{'_order'}->{$no};
 420     return 1;
 421 }
 422
 423
 424 =head2 purge
 425
 426  Title   : purge
 427  Usage   : $aln->purge(0.7);
 428  Function: Removes sequences above given sequence similarity
 429            This function will grind on large alignments. Beware!
 430  Example :
 431  Returns : An array of the removed sequences
 432  Args    : float, threshold for similarity
 433
 434 =cut
 435
 436 sub purge {
 437         my ($self,$perc) = @_;
 438         my (%duplicate, @dups);
 439
 440         my @seqs = $self->each_seq();
 441
 442         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 443                 my $seq = $seqs[$i];
 444
 445                 #skip if already in duplicate hash
 446                 next if exists $duplicate{$seq->display_id} ;
 447                 my $one = $seq->seq();
 448
 449                 my @one = split '', $one;       #split to get 1aa per array element
 450
 451                 for (my $j=$i+1;$j < @seqs;$j++) {
 452                         my $seq2 = $seqs[$j];
 453
 454                         #skip if already in duplicate hash
 455                         next if exists $duplicate{$seq2->display_id} ;
 456
 457                         my $two = $seq2->seq();
 458                         my @two = split '', $two;
 459
 460                         my $count = 0;
 461                         my $res = 0;
 462                         for (my $k=0;$k<@one;$k++) {
 463                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 464                                           $one[$k] eq $two[$k]) {
 465                                         $count++;
 466                                 }
 467                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 468                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 469                                         $res++;
 470                                 }
 471                         }
 472
 473                         my $ratio = 0;
 474                         $ratio = $count/$res unless $res == 0;
 475
 476                         # if above threshold put in duplicate hash and push onto
 477                         # duplicate array for returning to get_unique
 478                         if ( $ratio > $perc ) {
 479                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 480                                 $duplicate{$seq2->display_id} = 1;
 481                                 push @dups, $seq2;
 482                         }
 483                 }
 484         }
 485         foreach my $seq (@dups) {
 486                 $self->remove_seq($seq);
 487         }
 488         return @dups;
 489 }
 490
 491 =head2 sort_alphabetically
 492
 493  Title     : sort_alphabetically
 494  Usage     : $ali->sort_alphabetically
 495  Function  : Changes the order of the alignment to alphabetical on name
 496              followed by numerical by number.
 497  Returns   :
 498  Argument  :
 499
 500 =cut
 501
 502 sub sort_alphabetically {
 503     my $self = shift;
 504     my ($seq,$nse,@arr,%hash,$count);
 505
 506     foreach $seq ( $self->each_seq() ) {
 507         $nse = $seq->get_nse;
 508         $hash{$nse} = $seq;
 509     }
 510
 511     $count = 0;
 512
 513     %{$self->{'_order'}} = (); # reset the hash;
 514
 515     foreach $nse ( sort _alpha_startend keys %hash) {
 516         $self->{'_order'}->{$count} = $nse;
 517
 518         $count++;
 519     }
 520     1;
 521 }
 522
 523 =head2 sort_by_list
 524
 525  Title     : sort_by_list
 526  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 527  Function  : Arbitrarily order sequences in an alignment
 528  Returns   : A new Bio::SimpleAlign object
 529  Argument  : a file listing sequence names in intended order (one name per line)
 530
 531 =cut
 532
 533 sub sort_by_list {
 534     my ($self, $list) = @_;
 535     my (@seq, @ids, %order);
 536
 537     foreach my $seq ( $self->each_seq() ) {
 538         push @seq, $seq;
 539         push @ids, $seq->display_id;
 540     }
 541
 542     my $ct=1;
 543     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 544     while (<$listfh>) {
 545       chomp;
 546       my $name=$_;
 547       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 548       $order{$name}=$ct++;
 549     }
 550     close($listfh);
 551
 552     # use the map-sort-map idiom:
 553     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 554     my $aln = $self->new;
 555     foreach (@sorted) { $aln->add_seq($_) }
 556     return $aln;
 557 }
 558
 559 =head2 set_new_reference
 560
 561  Title     : set_new_reference
 562  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 563              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 564              as the reference (1st) sequence
 565  Function  : Change/Set a new reference (i.e., the first) sequence
 566  Returns   : a new Bio::SimpleAlign object.
 567              Throws an exception if designated sequence not found
 568  Argument  : a positive integer of sequence order, or a sequence name
 569              in the original alignment
 570
 571 =cut
 572
 573 sub set_new_reference {
 574     my ($self, $seqid) = @_;
 575     my $aln = $self->new;
 576     my (@seq, @ids, @new_seq);
 577     my $is_num=0;
 578     foreach my $seq ( $self->each_seq() ) {
 579         push @seq, $seq;
 580         push @ids, $seq->display_id;
 581     }
 582
 583     if ($seqid =~ /^\d+$/) { # argument is seq position
 584         $is_num=1;
 585         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 586     } else { # argument is a seq name
 587         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 588     }
 589
 590     for (my $i=0; $i<=$#seq; $i++) {
 591         my $pos=$i+1;
 592         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 593             unshift @new_seq, $seq[$i];
 594         } else {
 595             push @new_seq, $seq[$i];
 596         }
 597     }
 598     foreach (@new_seq) { $aln->add_seq($_);  }
 599     return $aln;
 600 }
 601
 602 sub _in_aln {  # check if input name exists in the alignment
 603     my ($str, $ref) = @_;
 604     foreach (@$ref) {
 605         return 1 if $str eq $_;
 606     }
 607     return 0;
 608 }
 609
 610
 611 =head2 uniq_seq
 612
 613  Title     : uniq_seq
 614  Usage     : $aln->uniq_seq():  Remove identical sequences in
 615              in the alignment.  Ambiguous base ("N", "n") and
 616              leading and ending gaps ("-") are NOT counted as
 617              differences.
 618  Function  : Make a new alignment of unique sequence types (STs)
 619  Returns   : 1a. if called in a scalar context,
 620                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 621              1b. if called in an array context,
 622                 a new Bio::SimpleAlign object, and a hashref whose keys
 623                 are sequence types, and whose values are arrayrefs to
 624                 lists of sequence ids within the corresponding sequence type
 625              2. if $aln->verbose > 0, ST of each sequence is sent to
 626                 STDERR (in a tabular format)
 627  Argument  : None
 628
 629 =cut
 630
 631 sub uniq_seq {
 632     my ($self, $seqid) = @_;
 633     my $aln = $self->new;
 634     my (%member, %order, @seq, @uniq_str, $st);
 635     my $order=0;
 636     my $len = $self->length();
 637     $st = {};
 638     foreach my $seq ( $self->each_seq() ) {
 639         my $str = $seq->seq();
 640
 641 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 642 # comparing two sequence strings
 643
 644     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 645         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 646     # 2nd, convert leading and ending gaps to "?":
 647         $str = &_convert_leading_ending_gaps($str, '-', '?');
 648     # Note that '?' also can mean unknown residue.
 649     # I don't like making global class member changes like this, too
 650     # prone to errors... -- cjfields 08-11-18
 651     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 652         my $new = Bio::LocatableSeq->new(
 653                      -id      => $seq->id(),
 654                                          -alphabet=> $seq->alphabet,
 655                                          -seq     => $str,
 656                                          -start   => $seq->start,
 657                                          -end     => $seq->end
 658                                          );
 659         push @seq, $new;
 660     }
 661
 662     foreach my $seq (@seq) {
 663         my $str = $seq->seq();
 664         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 665         if ($seen) { # seen before
 666             my @memb = @{$member{$key}};
 667             push @memb, $seq;
 668             $member{$key} = \@memb;
 669         } else {  # not seen
 670             push @uniq_str, $key;
 671             $order++;
 672             $member{$key} = [ ($seq) ];
 673             $order{$key} = $order;
 674         }
 675     }
 676
 677     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 678 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 679         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 680 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 681         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 682         my $gap='-';
 683         my $end= CORE::length($str2);
 684         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 685         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 686                                          -seq  =>$str2,
 687                                          -start=>1,
 688                                          -end  =>$end
 689                                          );
 690         $aln->add_seq($new);
 691         foreach (@{$member{$str}}) {
 692             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 693         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 694         }
 695     }
 696     return wantarray ? ($aln, $st) : $aln;
 697 }
 698
 699 sub _check_uniq {  # check if same seq exists in the alignment
 700     my ($str1, $ref, $length) = @_;
 701     my @char1=split //, $str1;
 702     my @array=@$ref;
 703
 704     return (0, $str1) if @array==0; # not seen (1st sequence)
 705
 706     foreach my $str2 (@array) {
 707         my $diff=0;
 708         my @char2=split //, $str2;
 709         for (my $i=0; $i<=$length-1; $i++) {
 710             next if $char1[$i] eq '?';
 711             next if $char2[$i] eq '?';
 712             $diff++ if $char1[$i] ne $char2[$i];
 713         }
 714         return (1, $str2) if $diff == 0;  # seen before
 715     }
 716
 717     return (0, $str1); # not seen
 718 }
 719
 720 sub _convert_leading_ending_gaps {
 721     my $s=shift;
 722     my $sym1=shift;
 723     my $sym2=shift;
 724     my @array=split //, $s;
 725 # convert leading char:
 726     for (my $i=0; $i<=$#array; $i++) {
 727         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 728     }
 729 # convert ending char:
 730     for (my $i = $#array; $i>= 0; $i--) {
 731         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 732     }
 733     my $s_new=join '', @array;
 734     return $s_new;
 735 }
 736
 737 =head1 Sequence selection methods
 738
 739 Methods returning one or more sequences objects.
 740
 741 =head2 each_seq
 742
 743  Title     : each_seq
 744  Usage     : foreach $seq ( $align->each_seq() )
 745  Function  : Gets a Seq object from the alignment
 746  Returns   : Seq object
 747  Argument  :
 748
 749 =cut
 750
 751 sub eachSeq {
 752     my $self = shift;
 753     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 754     $self->each_seq();
 755 }
 756
 757 sub each_seq {
 758         my $self = shift;
 759         my (@arr,$order);
 760
 761         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 762                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 763                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 764                 }
 765         }
 766         return @arr;
 767 }
 768
 769
 770 =head2 each_alphabetically
 771
 772  Title     : each_alphabetically
 773  Usage     : foreach $seq ( $ali->each_alphabetically() )
 774  Function  : Returns a sequence object, but the objects are returned
 775              in alphabetically sorted order.
 776              Does not change the order of the alignment.
 777  Returns   : Seq object
 778  Argument  :
 779
 780 =cut
 781
 782 sub each_alphabetically {
 783         my $self = shift;
 784         my ($seq,$nse,@arr,%hash,$count);
 785
 786         foreach $seq ( $self->each_seq() ) {
 787                 $nse = $seq->get_nse;
 788                 $hash{$nse} = $seq;
 789         }
 790
 791         foreach $nse ( sort _alpha_startend keys %hash) {
 792                 push(@arr,$hash{$nse});
 793         }
 794         return @arr;
 795 }
 796
 797 sub _alpha_startend {
 798     my ($aname,$astart,$bname,$bstart);
 799     ($aname,$astart) = split (/-/,$a);
 800     ($bname,$bstart) = split (/-/,$b);
 801
 802     if( $aname eq $bname ) {
 803         return $astart <=> $bstart;
 804     }
 805     else {
 806         return $aname cmp $bname;
 807     }
 808 }
 809
 810 =head2 each_seq_with_id
 811
 812  Title     : each_seq_with_id
 813  Usage     : foreach $seq ( $align->each_seq_with_id() )
 814  Function  : Gets a Seq objects from the alignment, the contents
 815              being those sequences with the given name (there may be
 816              more than one)
 817  Returns   : Seq object
 818  Argument  : a seq name
 819
 820 =cut
 821
 822 sub eachSeqWithId {
 823     my $self = shift;
 824     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 825     $self->each_seq_with_id(@_);
 826 }
 827
 828 sub each_seq_with_id {
 829     my $self = shift;
 830     my $id = shift;
 831
 832     $self->throw("Method each_seq_with_id needs a sequence name argument")
 833         unless defined $id;
 834
 835     my (@arr, $seq);
 836
 837     if (exists($self->{'_start_end_lists'}->{$id})) {
 838         @arr = @{$self->{'_start_end_lists'}->{$id}};
 839     }
 840     return @arr;
 841 }
 842
 843 =head2 get_seq_by_pos
 844
 845  Title     : get_seq_by_pos
 846  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 847  Function  : Gets a sequence based on its position in the alignment.
 848              Numbering starts from 1.  Sequence positions larger than
 849              num_sequences() will thow an error.
 850  Returns   : a Bio::LocatableSeq object
 851  Args      : positive integer for the sequence position
 852
 853 =cut
 854
 855 sub get_seq_by_pos {
 856
 857     my $self = shift;
 858     my ($pos) = @_;
 859
 860     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 861         unless $pos =~ /^\d+$/ and $pos > 0;
 862     $self->throw("No sequence at position [$pos]")
 863         unless $pos <= $self->num_sequences ;
 864
 865     my $nse = $self->{'_order'}->{--$pos};
 866     return $self->{'_seq'}->{$nse};
 867 }
 868
 869 =head2 get_seq_by_id
 870
 871  Title     : get_seq_by_id
 872  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 873  Function  : Gets a sequence based on its name.
 874              Sequences that do not exist will warn and return undef
 875  Returns   : a Bio::LocatableSeq object
 876  Args      : string for sequence name
 877
 878 =cut
 879
 880 sub get_seq_by_id {
 881     my ($self,$name) = @_;
 882     unless( defined $name ) {
 883       $self->warn("Must provide a sequence name");
 884       return;
 885     }
 886     for my $seq ( values %{$self->{'_seq'}} ) {
 887       if ( $seq->id eq $name) {
 888         return $seq;
 889       }
 890     }
 891     return;
 892 }
 893
 894 =head2 seq_with_features
 895
 896  Title   : seq_with_features
 897  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 898                                           -consensus => 60
 899                                           -mask =>
 900            sub { my $consensus = shift;
 901
 902                  for my $i (1..5){
 903                     my $n = 'N' x $i;
 904                     my $q = '\?' x $i;
 905                     while($consensus =~ /[^?]$q[^?]/){
 906                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 907                     }
 908                   }
 909                  return $consensus;
 910                }
 911                                          );
 912  Function: produces a Bio::Seq object by first splicing gaps from -pos
 913            (by means of a splice_by_seq_pos() call), then creating
 914            features using non-? chars (by means of a consensus_string()
 915            call with stringency -consensus).
 916  Returns : a Bio::Seq object
 917  Args    : -pos : required. sequence from which to build the Bio::Seq
 918              object
 919            -consensus : optional, defaults to consensus_string()'s
 920              default cutoff value
 921            -mask : optional, a coderef to apply to consensus_string()'s
 922              output before building features.  this may be useful for
 923              closing gaps of 1 bp by masking over them with N, for
 924              instance
 925
 926 =cut
 927
 928 sub seq_with_features{
 929    my ($self,%arg) = @_;
 930
 931    #first do the preparatory splice
 932    $self->throw("must provide a -pos argument") unless $arg{-pos};
 933    $self->splice_by_seq_pos($arg{-pos});
 934
 935    my $consensus_string = $self->consensus_string($arg{-consensus});
 936    $consensus_string = $arg{-mask}->($consensus_string)
 937          if defined($arg{-mask});
 938
 939    my(@bs,@es);
 940
 941    push @bs, 1 if $consensus_string =~ /^[^?]/;
 942
 943    while($consensus_string =~ /\?[^?]/g){
 944          push @bs, pos($consensus_string);
 945    }
 946    while($consensus_string =~ /[^?]\?/g){
 947          push @es, pos($consensus_string);
 948    }
 949
 950    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 951
 952    my $seq = Bio::Seq->new();
 953
 954 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 955 #                -source_tag => 'location',
 956 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 957 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 958 #                                                  );
 959 #   $seq->add_SeqFeature($rootfeature);
 960
 961    while(my $b = shift @bs){
 962          my $e = shift @es;
 963          $seq->add_SeqFeature(
 964        Bio::SeqFeature::Generic->new(
 965          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 966          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 967          -source_tag => $self->source || 'MSA',
 968        )
 969      );
 970    }
 971
 972    return $seq;
 973 }
 974
 975
 976 =head1 Create new alignments
 977
 978 The result of these methods are horizontal or vertical subsets of the
 979 current MSA.
 980
 981 =head2 select
 982
 983  Title     : select
 984  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 985  Function  : Creates a new alignment from a continuous subset of
 986              sequences.  Numbering starts from 1.  Sequence positions
 987              larger than num_sequences() will thow an error.
 988  Returns   : a Bio::SimpleAlign object
 989  Args      : positive integer for the first sequence
 990              positive integer for the last sequence to include (optional)
 991
 992 =cut
 993
 994 sub select {
 995     my $self = shift;
 996     my ($start, $end) = @_;
 997
 998     $self->throw("Select start has to be a positive integer, not [$start]")
 999         unless $start =~ /^\d+$/ and $start > 0;
1000     $self->throw("Select end has to be a positive integer, not [$end]")
1001         unless $end  =~ /^\d+$/ and $end > 0;
1002     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1003         unless $start <= $end;
1004
1005     my $aln = $self->new;
1006     foreach my $pos ($start .. $end) {
1007         $aln->add_seq($self->get_seq_by_pos($pos));
1008     }
1009     $aln->id($self->id);
1010     # fix for meta, sf, ann
1011     return $aln;
1012 }
1013
1014 =head2 select_noncont
1015
1016  Title     : select_noncont
1017  Usage     : # 1st and 3rd sequences, sorted
1018              $aln2 = $aln->select_noncont(1, 3)
1019
1020              # 1st and 3rd sequences, sorted (same as first)
1021              $aln2 = $aln->select_noncont(3, 1)
1022
1023              # 1st and 3rd sequences, unsorted
1024              $aln2 = $aln->select_noncont('nosort',3, 1)
1025
1026  Function  : Creates a new alignment from a subset of sequences.  Numbering
1027              starts from 1.  Sequence positions larger than num_sequences() will
1028              throw an error.  Sorts the order added to new alignment by default,
1029              to prevent sorting pass 'nosort' as the first argument in the list.
1030  Returns   : a Bio::SimpleAlign object
1031  Args      : array of integers for the sequences.  If the string 'nosort' is
1032              passed as the first argument, the sequences will not be sorted
1033              in the new alignment but will appear in the order listed.
1034
1035 =cut
1036
1037 sub select_noncont {
1038         my $self = shift;
1039     my $nosort = 0;
1040         my (@pos) = @_;
1041     if ($pos[0] !~ m{^\d+$}) {
1042         my $sortcmd = shift @pos;
1043         if ($sortcmd eq 'nosort') {
1044             $nosort = 1;
1045         } else {
1046             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1047         }
1048     }
1049
1050     my $end = $self->num_sequences;
1051     foreach ( @pos ) {
1052                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1053                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1054         }
1055
1056         @pos = sort {$a <=> $b} @pos unless $nosort;
1057
1058         my $aln = $self->new;
1059         foreach my $p (@pos) {
1060                 $aln->add_seq($self->get_seq_by_pos($p));
1061         }
1062         $aln->id($self->id);
1063     # fix for meta, sf, ann
1064         return $aln;
1065 }
1066
1067 =head2 select_noncont_by_name
1068
1069  Title     : select_noncont_by_name
1070  Usage     : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1071  Function  : Creates a new alignment from a subset of sequences which are
1072              selected by name (sequence ID).
1073  Returns   : a Bio::SimpleAlign object
1074  Args      : array of names (i.e., identifiers) for the sequences.
1075
1076 =cut
1077
1078 sub select_noncont_by_name {
1079     my ($self, @names) = @_;
1080
1081     my $aln = $self->new;
1082     foreach my $name (@names) {
1083         $aln->add_seq($self->get_seq_by_id($name));
1084     }
1085     $aln->id($self->id);
1086
1087     return $aln;
1088 }
1089
1090 =head2 slice
1091
1092  Title     : slice
1093  Usage     : $aln2 = $aln->slice(20,30)
1094  Function  : Creates a slice from the alignment inclusive of start and
1095              end columns, and the first column in the alignment is denoted 1.
1096              Sequences with no residues in the slice are excluded from the
1097              new alignment and a warning is printed. Slice beyond the length of
1098              the sequence does not do padding.
1099  Returns   : A Bio::SimpleAlign object
1100  Args      : Positive integer for start column, positive integer for end column,
1101              optional boolean which if true will keep gap-only columns in the newly
1102              created slice. Example:
1103
1104              $aln2 = $aln->slice(20,30,1)
1105
1106 =cut
1107
1108 sub slice {
1109         my $self = shift;
1110         my ($start, $end, $keep_gap_only) = @_;
1111
1112         $self->throw("Slice start has to be a positive integer, not [$start]")
1113           unless $start =~ /^\d+$/ and $start > 0;
1114         $self->throw("Slice end has to be a positive integer, not [$end]")
1115           unless $end =~ /^\d+$/ and $end > 0;
1116         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1117           unless $start <= $end;
1118         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1119                                          "[$start] is too big.") if $start > $self->length;
1120     my $cons_meta = $self->consensus_meta;
1121         my $aln = $self->new;
1122         $aln->id($self->id);
1123         foreach my $seq ( $self->each_seq() ) {
1124             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1125             Bio::Seq::Meta->new
1126         (-id      => $seq->id,
1127                  -alphabet => $seq->alphabet,
1128                  -strand  => $seq->strand,
1129                  -verbose => $self->verbose) :
1130             Bio::LocatableSeq->new
1131         (-id      => $seq->id,
1132                  -alphabet => $seq->alphabet,
1133                  -strand  => $seq->strand,
1134                  -verbose => $self->verbose);
1135
1136             # seq
1137             my $seq_end = $end;
1138             $seq_end = $seq->length if( $end > $seq->length );
1139
1140             my $slice_seq = $seq->subseq($start, $seq_end);
1141             $new_seq->seq( $slice_seq );
1142
1143             $slice_seq =~ s/\W//g;
1144
1145             if ($start > 1) {
1146             my $pre_start_seq = $seq->subseq(1, $start - 1);
1147             $pre_start_seq =~ s/\W//g;
1148             if (!defined($seq->strand)) {
1149                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1150             } elsif ($seq->strand < 0){
1151                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1152             } else {
1153                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1154             }
1155             } else {
1156             if ((defined $seq->strand)&&($seq->strand < 0)){
1157                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1158             } else {
1159                $new_seq->start( $seq->start);
1160             }
1161             }
1162         if ($new_seq->isa('Bio::Seq::MetaI')) {
1163             for my $meta_name ($seq->meta_names) {
1164                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1165             }
1166         }
1167             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1168
1169             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1170             $aln->add_seq($new_seq);
1171             } else {
1172             if( $keep_gap_only ) {
1173                 $aln->add_seq($new_seq);
1174             } else {
1175                 my $nse = $seq->get_nse();
1176                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1177                     " Sequence excluded from the new alignment.");
1178             }
1179             }
1180         }
1181     if ($cons_meta) {
1182         my $new = Bio::Seq::Meta->new();
1183         for my $meta_name ($cons_meta->meta_names) {
1184             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1185         }
1186         $aln->consensus_meta($new);
1187     }
1188     $aln->annotation($self->annotation);
1189     # fix for meta, sf, ann
1190         return $aln;
1191 }
1192
1193 =head2 remove_columns
1194
1195  Title     : remove_columns
1196  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1197              $aln2 = $aln->remove_columns([0,0],[6,8])
1198  Function  : Creates an aligment with columns removed corresponding to
1199              the specified type or by specifying the columns by number.
1200  Returns   : Bio::SimpleAlign object
1201  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1202              'all_gaps_columns') or array ref where the referenced array
1203              contains a pair of integers that specify a range.
1204              The first column is 0
1205
1206 =cut
1207
1208 sub remove_columns {
1209     my ($self,@args) = @_;
1210     @args || $self->throw("Must supply column ranges or column types");
1211     my $aln;
1212
1213     if ($args[0][0] =~ /^[a-z_]+$/i) {
1214         $aln = $self->_remove_columns_by_type($args[0]);
1215     } elsif ($args[0][0] =~ /^\d+$/) {
1216         $aln = $self->_remove_columns_by_num(\@args);
1217     } else {
1218         $self->throw("You must pass array references to remove_columns(), not @args");
1219     }
1220     # fix for meta, sf, ann
1221     $aln;
1222 }
1223
1224
1225 =head2 remove_gaps
1226
1227  Title     : remove_gaps
1228  Usage     : $aln2 = $aln->remove_gaps
1229  Function  : Creates an aligment with gaps removed
1230  Returns   : a Bio::SimpleAlign object
1231  Args      : a gap character(optional) if none specified taken
1232                 from $self->gap_char,
1233              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1234                         indicates that only all-gaps columns should be deleted
1235
1236 Used from method L<remove_columns> in most cases. Set gap character
1237 using L<gap_char()|gap_char>.
1238
1239 =cut
1240
1241 sub remove_gaps {
1242     my ($self,$gapchar,$all_gaps_columns) = @_;
1243     my $gap_line;
1244     if ($all_gaps_columns) {
1245         $gap_line = $self->all_gap_line($gapchar);
1246     } else {
1247         $gap_line = $self->gap_line($gapchar);
1248     }
1249     my $aln = $self->new;
1250
1251     my @remove;
1252     my $length = 0;
1253     my $del_char = $gapchar || $self->gap_char;
1254     # Do the matching to get the segments to remove
1255     while ($gap_line =~ m/[$del_char]/g) {
1256         my $start = pos($gap_line)-1;
1257         $gap_line=~/\G[$del_char]+/gc;
1258         my $end = pos($gap_line)-1;
1259
1260         #have to offset the start and end for subsequent removes
1261         $start-=$length;
1262         $end  -=$length;
1263         $length += ($end-$start+1);
1264         push @remove, [$start,$end];
1265     }
1266
1267     #remove the segments
1268     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1269     # fix for meta, sf, ann
1270     return $aln;
1271 }
1272
1273
1274 sub _remove_col {
1275     my ($self,$aln,$remove) = @_;
1276     my @new;
1277
1278     my $gap = $self->gap_char;
1279
1280     # splice out the segments and create new seq
1281     foreach my $seq($self->each_seq){
1282         my $new_seq = Bio::LocatableSeq->new(
1283                                              -id      => $seq->id,
1284                                              -alphabet=> $seq->alphabet,
1285                                              -strand  => $seq->strand,
1286                                              -verbose => $self->verbose);
1287         my $sequence = $seq->seq;
1288         foreach my $pair(@{$remove}){
1289             my $start = $pair->[0];
1290             my $end   = $pair->[1];
1291             $sequence = $seq->seq unless $sequence;
1292             my $orig = $sequence;
1293             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1294             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1295             $sequence = $head.$tail;
1296             # start
1297             unless (defined $new_seq->start) {
1298                 if ($start == 0) {
1299                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1300                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1301                 }
1302                 else {
1303                     my $start_adjust = $orig =~ /^$gap+/;
1304                     if ($start_adjust) {
1305                         $start_adjust = $+[0] == $start;
1306                     }
1307                     $new_seq->start($seq->start + $start_adjust);
1308                 }
1309             }
1310             # end
1311             if (($end + 1) >= CORE::length($orig)) {
1312                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1313                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1314             }
1315             else {
1316                 $new_seq->end($seq->end);
1317             }
1318         }
1319
1320         if ($new_seq->end < $new_seq->start) {
1321             # we removed all columns except for gaps: set to 0 to indicate no
1322             # sequence
1323             $new_seq->start(0);
1324             $new_seq->end(0);
1325         }
1326
1327         $new_seq->seq($sequence) if $sequence;
1328                 push @new, $new_seq;
1329     }
1330     # add the new seqs to the alignment
1331     foreach my $new(@new){
1332         $aln->add_seq($new);
1333     }
1334     # fix for meta, sf, ann
1335     return $aln;
1336 }
1337
1338 sub _remove_columns_by_type {
1339         my ($self,$type) = @_;
1340         my $aln = $self->new;
1341         my @remove;
1342
1343         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1344         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1345         my %matchchars = ( 'match'           => '\*',
1346                        'weak'             => '\.',
1347                        'strong'           => ':',
1348                        'mismatch'         => ' ',
1349                        'gaps'             => '',
1350                        'all_gaps_columns' => ''
1351                      );
1352         # get the characters to delete against
1353         my $del_char;
1354         foreach my $type (@{$type}){
1355                 $del_char.= $matchchars{$type};
1356         }
1357
1358         my $length = 0;
1359         my $match_line = $self->match_line;
1360         # do the matching to get the segments to remove
1361         if($del_char){
1362                 while($match_line =~ m/[$del_char]/g ){
1363                         my $start = pos($match_line)-1;
1364                         $match_line=~/\G[$del_char]+/gc;
1365                         my $end = pos($match_line)-1;
1366
1367                         #have to offset the start and end for subsequent removes
1368                         $start-=$length;
1369                         $end  -=$length;
1370                         $length += ($end-$start+1);
1371                         push @remove, [$start,$end];
1372                 }
1373         }
1374
1375         # remove the segments
1376         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1377         $aln = $aln->remove_gaps() if $gap;
1378         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1379     # fix for meta, sf, ann
1380         $aln;
1381 }
1382
1383
1384 sub _remove_columns_by_num {
1385         my ($self,$positions) = @_;
1386         my $aln = $self->new;
1387
1388         # sort the positions
1389         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1390
1391     my @remove;
1392     my $length = 0;
1393     foreach my $pos (@{$positions}) {
1394         my ($start, $end) = @{$pos};
1395
1396         #have to offset the start and end for subsequent removes
1397         $start-=$length;
1398         $end  -=$length;
1399         $length += ($end-$start+1);
1400         push @remove, [$start,$end];
1401     }
1402
1403     #remove the segments
1404     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1405     # fix for meta, sf, ann
1406         $aln;
1407 }
1408
1409
1410 =head1 Change sequences within the MSA
1411
1412 These methods affect characters in all sequences without changing the
1413 alignment.
1414
1415 =head2 splice_by_seq_pos
1416
1417  Title   : splice_by_seq_pos
1418  Usage   : $status = splice_by_seq_pos(1);
1419  Function: splices all aligned sequences where the specified sequence
1420            has gaps.
1421  Example :
1422  Returns : 1 on success
1423  Args    : position of sequence to splice by
1424
1425
1426 =cut
1427
1428 sub splice_by_seq_pos{
1429   my ($self,$pos) = @_;
1430
1431   my $guide = $self->get_seq_by_pos($pos);
1432   my $guide_seq = $guide->seq;
1433
1434   $guide_seq =~ s/\./\-/g;
1435
1436   my @gaps = ();
1437   $pos = -1;
1438   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1439     unshift @gaps, $pos;
1440     $pos++;
1441   }
1442
1443   foreach my $seq ($self->each_seq){
1444     my @bases = split '', $seq->seq;
1445
1446     splice(@bases, $_, 1) foreach @gaps;
1447     $seq->seq(join('', @bases));
1448   }
1449
1450   1;
1451 }
1452
1453 =head2 map_chars
1454
1455  Title     : map_chars
1456  Usage     : $ali->map_chars('\.','-')
1457  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1458              characters
1459
1460              Notice that the from (arg1) is interpretted as a regex,
1461              so be careful about quoting meta characters (eg
1462              $ali->map_chars('.','-') wont do what you want)
1463  Returns   :
1464  Argument  : 'from' rexexp
1465              'to' string
1466
1467 =cut
1468
1469 sub map_chars {
1470     my $self = shift;
1471     my $from = shift;
1472     my $to   = shift;
1473     my ($seq,$temp);
1474
1475     $self->throw("Need exactly two arguments")
1476         unless defined $from and defined $to;
1477
1478     foreach $seq ( $self->each_seq() ) {
1479         $temp = $seq->seq();
1480         $temp =~ s/$from/$to/g;
1481         $seq->seq($temp);
1482     }
1483     return 1;
1484 }
1485
1486
1487 =head2 uppercase
1488
1489  Title     : uppercase()
1490  Usage     : $ali->uppercase()
1491  Function  : Sets all the sequences to uppercase
1492  Returns   :
1493  Argument  :
1494
1495 =cut
1496
1497 sub uppercase {
1498     my $self = shift;
1499     my $seq;
1500     my $temp;
1501
1502     foreach $seq ( $self->each_seq() ) {
1503       $temp = $seq->seq();
1504       $temp =~ tr/[a-z]/[A-Z]/;
1505
1506       $seq->seq($temp);
1507     }
1508     return 1;
1509 }
1510
1511 =head2 cigar_line
1512
1513  Title    : cigar_line()
1514  Usage    : %cigars = $align->cigar_line()
1515  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1516             Report) line for each sequence in the alignment. Examples are
1517             "1,60" or "5,10:12,58", where the numbers refer to conserved
1518             positions within the alignment. The keys of the hash are the
1519             NSEs (name/start/end) assigned to each sequence.
1520  Args     : threshold (optional, defaults to 100)
1521  Returns  : Hash of strings (cigar lines)
1522
1523 =cut
1524
1525 sub cigar_line {
1526         my $self = shift;
1527         my $thr=shift||100;
1528         my %cigars;
1529
1530         my @consensus = split "",($self->consensus_string($thr));
1531         my $len = $self->length;
1532         my $gapchar = $self->gap_char;
1533
1534         # create a precursor, something like (1,4,5,6,7,33,45),
1535         # where each number corresponds to a conserved position
1536         foreach my $seq ( $self->each_seq ) {
1537                 my @seq = split "", uc ($seq->seq);
1538                 my $pos = 1;
1539                 for (my $x = 0 ; $x < $len ; $x++ ) {
1540                         if ($seq[$x] eq $consensus[$x]) {
1541                                 push @{$cigars{$seq->get_nse}},$pos;
1542                                 $pos++;
1543                         } elsif ($seq[$x] ne $gapchar) {
1544                                 $pos++;
1545                         }
1546                 }
1547         }
1548         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1549         for my $name (keys %cigars) {
1550                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1551                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1552       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1553            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1554                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1555                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1556                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1557                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1558                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1559                         }
1560       }
1561         }
1562   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1563   for my $name (keys %cigars) {
1564           my @remove;
1565           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1566                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1567                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1568                       unshift @remove,$x;
1569               }
1570            }
1571       for my $pos (@remove) {
1572                         splice @{$cigars{$name}}, $pos, 1;
1573            }
1574    }
1575    # join and punctuate
1576    for my $name (keys %cigars) {
1577           my ($start,$end,$str) = "";
1578           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1579                   $str .= ($start . "," . $end . ":");
1580           }
1581           $str =~ s/:$//;
1582       $cigars{$name} = $str;
1583    }
1584    %cigars;
1585 }
1586
1587
1588 =head2 match_line
1589
1590  Title    : match_line()
1591  Usage    : $line = $align->match_line()
1592  Function : Generates a match line - much like consensus string
1593             except that a line indicating the '*' for a match.
1594  Args     : (optional) Match line characters ('*' by default)
1595             (optional) Strong match char (':' by default)
1596             (optional) Weak match char ('.' by default)
1597  Returns  : String
1598
1599 =cut
1600
1601 sub match_line {
1602         my ($self,$matchlinechar, $strong, $weak) = @_;
1603         my %matchchars = ('match'    => $matchlinechar || '*',
1604                                                           'weak'     => $weak          || '.',
1605                                                           'strong'   => $strong        || ':',
1606                                                           'mismatch' => ' ',
1607                                                   );
1608
1609         my @seqchars;
1610         my $alphabet;
1611         foreach my $seq ( $self->each_seq ) {
1612                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1613                 $alphabet = $seq->alphabet unless defined $alphabet;
1614         }
1615         my $refseq = shift @seqchars;
1616         # let's just march down the columns
1617         my $matchline;
1618  POS:
1619         foreach my $pos ( 0..$self->length ) {
1620                 my $refchar = $refseq->[$pos];
1621                 my $char = $matchchars{'mismatch'};
1622                 unless( defined $refchar ) {
1623                         last if $pos == $self->length; # short circuit on last residue
1624                         # this in place to handle jason's soon-to-be-committed
1625                         # intron mapping code
1626                         goto bottom;
1627                 }
1628                 my %col = ($refchar => 1);
1629                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1630                 foreach my $seq ( @seqchars ) {
1631                         next if $pos >= scalar @$seq;
1632                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1633                                                           $seq->[$pos] eq ' ' );
1634                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1635                 }
1636                 my @colresidues = sort keys %col;
1637
1638                 # if all the values are the same
1639                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1640                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1641                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1642                         # matches for protein seqs
1643             TYPE:
1644                         foreach my $type ( qw(strong weak) ) {
1645                                 # iterate through categories
1646                                 my %groups;
1647                                 # iterate through each of the aa in the col
1648                                 # look to see which groups it is in
1649                                 foreach my $c ( @colresidues ) {
1650                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1651                                                 push @{$groups{$f}},$c;
1652                                         }
1653                                 }
1654                          GRP:
1655                                 foreach my $cols ( values %groups ) {
1656                                         @$cols = sort @$cols;
1657                                         # now we are just testing to see if two arrays
1658                                         # are identical w/o changing either one
1659                                         # have to be same len
1660                                         next if( scalar @$cols != scalar @colresidues );
1661                                         # walk down the length and check each slot
1662                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1663                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1664                                         }
1665                                         $char = $matchchars{$type};
1666                                         last TYPE;
1667                                 }
1668                         }
1669                 }
1670          bottom:
1671                 $matchline .= $char;
1672         }
1673         return $matchline;
1674 }
1675
1676
1677 =head2 gap_line
1678
1679  Title    : gap_line()
1680  Usage    : $line = $align->gap_line()
1681  Function : Generates a gap line - much like consensus string
1682             except that a line where '-' represents gap
1683  Args     : (optional) gap line characters ('-' by default)
1684  Returns  : string
1685
1686 =cut
1687
1688 sub gap_line {
1689     my ($self,$gapchar) = @_;
1690     $gapchar = $gapchar || $self->gap_char;
1691     my %gap_hsh; # column gaps vector
1692     foreach my $seq ( $self->each_seq ) {
1693                 my $i = 0;
1694         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1695                   map {[$i++, $_]} split(//, uc ($seq->seq));
1696     }
1697     my $gap_line;
1698     foreach my $pos ( 0..$self->length-1 ) {
1699           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1700     }
1701     return $gap_line;
1702 }
1703
1704 =head2 all_gap_line
1705
1706  Title    : all_gap_line()
1707  Usage    : $line = $align->all_gap_line()
1708  Function : Generates a gap line - much like consensus string
1709             except that a line where '-' represents all-gap column
1710  Args     : (optional) gap line characters ('-' by default)
1711  Returns  : string
1712
1713 =cut
1714
1715 sub all_gap_line {
1716     my ($self,$gapchar) = @_;
1717     $gapchar = $gapchar || $self->gap_char;
1718     my %gap_hsh;                # column gaps counter hash
1719     my @seqs = $self->each_seq;
1720     foreach my $seq ( @seqs ) {
1721         my $i = 0;
1722         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1723         map {[$i++, $_]} split(//, uc ($seq->seq));
1724     }
1725     my $gap_line;
1726     foreach my $pos ( 0..$self->length-1 ) {
1727         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1728             # gaps column
1729             $gap_line .= $gapchar;
1730         } else {
1731             $gap_line .= '.';
1732         }
1733     }
1734     return $gap_line;
1735 }
1736
1737 =head2 gap_col_matrix
1738
1739  Title    : gap_col_matrix()
1740  Usage    : my $cols = $align->gap_col_matrix()
1741  Function : Generates an array of hashes where
1742             each entry in the array is a hash reference
1743             with keys of all the sequence names and
1744             and value of 1 or 0 if the sequence has a gap at that column
1745  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1746
1747 =cut
1748
1749 sub gap_col_matrix {
1750     my ($self,$gapchar) = @_;
1751     $gapchar = $gapchar || $self->gap_char;
1752     my %gap_hsh; # column gaps vector
1753     my @cols;
1754     foreach my $seq ( $self->each_seq ) {
1755         my $i = 0;
1756         my $str = $seq->seq;
1757         my $len = $seq->length;
1758         my $ch;
1759         my $id = $seq->display_id;
1760         while( $i < $len ) {
1761             $ch = substr($str, $i, 1);
1762             $cols[$i++]->{$id} = ($ch eq $gapchar);
1763         }
1764     }
1765     return \@cols;
1766 }
1767
1768 =head2 match
1769
1770  Title     : match()
1771  Usage     : $ali->match()
1772  Function  : Goes through all columns and changes residues that are
1773              identical to residue in first sequence to match '.'
1774              character. Sets match_char.
1775
1776              USE WITH CARE: Most MSA formats do not support match
1777              characters in sequences, so this is mostly for output
1778              only. NEXUS format (Bio::AlignIO::nexus) can handle
1779              it.
1780  Returns   : 1
1781  Argument  : a match character, optional, defaults to '.'
1782
1783 =cut
1784
1785 sub match {
1786     my ($self, $match) = @_;
1787
1788     $match ||= '.';
1789     my ($matching_char) = $match;
1790     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1791     $self->map_chars($matching_char, '-');
1792
1793     my @seqs = $self->each_seq();
1794     return 1 unless scalar @seqs > 1;
1795
1796     my $refseq = shift @seqs ;
1797     my @refseq = split //, $refseq->seq;
1798     my $gapchar = $self->gap_char;
1799
1800     foreach my $seq ( @seqs ) {
1801         my @varseq = split //, $seq->seq();
1802         for ( my $i=0; $i < scalar @varseq; $i++) {
1803             $varseq[$i] = $match if defined $refseq[$i] &&
1804                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1805                   $refseq[$i] =~ /$gapchar/ )
1806                       && $refseq[$i] eq $varseq[$i];
1807         }
1808         $seq->seq(join '', @varseq);
1809     }
1810     $self->match_char($match);
1811     return 1;
1812 }
1813
1814
1815 =head2 unmatch
1816
1817  Title     : unmatch()
1818  Usage     : $ali->unmatch()
1819  Function  : Undoes the effect of method match. Unsets match_char.
1820  Returns   : 1
1821  Argument  : a match character, optional, defaults to '.'
1822
1823 See L<match> and L<match_char>
1824
1825 =cut
1826
1827 sub unmatch {
1828     my ($self, $match) = @_;
1829
1830     $match ||= '.';
1831
1832     my @seqs = $self->each_seq();
1833     return 1 unless scalar @seqs > 1;
1834
1835     my $refseq = shift @seqs ;
1836     my @refseq = split //, $refseq->seq;
1837     my $gapchar = $self->gap_char;
1838     foreach my $seq ( @seqs ) {
1839         my @varseq = split //, $seq->seq();
1840         for ( my $i=0; $i < scalar @varseq; $i++) {
1841             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1842                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1843                   $refseq[$i] =~ /$gapchar/ ) &&
1844                       $varseq[$i] eq $match;
1845         }
1846         $seq->seq(join '', @varseq);
1847     }
1848     $self->match_char('');
1849     return 1;
1850 }
1851
1852 =head1 MSA attributes
1853
1854 Methods for setting and reading the MSA attributes.
1855
1856 Note that the methods defining character semantics depend on the user
1857 to set them sensibly.  They are needed only by certain input/output
1858 methods. Unset them by setting to an empty string ('').
1859
1860 =head2 id
1861
1862  Title     : id
1863  Usage     : $myalign->id("Ig")
1864  Function  : Gets/sets the id field of the alignment
1865  Returns   : An id string
1866  Argument  : An id string (optional)
1867
1868 =cut
1869
1870 sub id {
1871     my ($self, $name) = @_;
1872
1873     if (defined( $name )) {
1874         $self->{'_id'} = $name;
1875     }
1876
1877     return $self->{'_id'};
1878 }
1879
1880 =head2 accession
1881
1882  Title     : accession
1883  Usage     : $myalign->accession("PF00244")
1884  Function  : Gets/sets the accession field of the alignment
1885  Returns   : An acc string
1886  Argument  : An acc string (optional)
1887
1888 =cut
1889
1890 sub accession {
1891     my ($self, $acc) = @_;
1892
1893     if (defined( $acc )) {
1894         $self->{'_accession'} = $acc;
1895     }
1896
1897     return $self->{'_accession'};
1898 }
1899
1900 =head2 description
1901
1902  Title     : description
1903  Usage     : $myalign->description("14-3-3 proteins")
1904  Function  : Gets/sets the description field of the alignment
1905  Returns   : An description string
1906  Argument  : An description string (optional)
1907
1908 =cut
1909
1910 sub description {
1911     my ($self, $name) = @_;
1912
1913     if (defined( $name )) {
1914         $self->{'_description'} = $name;
1915     }
1916
1917     return $self->{'_description'};
1918 }
1919
1920 =head2 missing_char
1921
1922  Title     : missing_char
1923  Usage     : $myalign->missing_char("?")
1924  Function  : Gets/sets the missing_char attribute of the alignment
1925              It is generally recommended to set it to 'n' or 'N'
1926              for nucleotides and to 'X' for protein.
1927  Returns   : An missing_char string,
1928  Argument  : An missing_char string (optional)
1929
1930 =cut
1931
1932 sub missing_char {
1933     my ($self, $char) = @_;
1934
1935     if (defined $char ) {
1936         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1937         $self->{'_missing_char'} = $char;
1938     }
1939
1940     return $self->{'_missing_char'};
1941 }
1942
1943 =head2 match_char
1944
1945  Title     : match_char
1946  Usage     : $myalign->match_char('.')
1947  Function  : Gets/sets the match_char attribute of the alignment
1948  Returns   : An match_char string,
1949  Argument  : An match_char string (optional)
1950
1951 =cut
1952
1953 sub match_char {
1954     my ($self, $char) = @_;
1955
1956     if (defined $char ) {
1957         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1958         $self->{'_match_char'} = $char;
1959     }
1960
1961     return $self->{'_match_char'};
1962 }
1963
1964 =head2 gap_char
1965
1966  Title     : gap_char
1967  Usage     : $myalign->gap_char('-')
1968  Function  : Gets/sets the gap_char attribute of the alignment
1969  Returns   : An gap_char string, defaults to '-'
1970  Argument  : An gap_char string (optional)
1971
1972 =cut
1973
1974 sub gap_char {
1975     my ($self, $char) = @_;
1976
1977     if (defined $char || ! defined $self->{'_gap_char'} ) {
1978         $char= '-' unless defined $char;
1979         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1980         $self->{'_gap_char'} = $char;
1981     }
1982     return $self->{'_gap_char'};
1983 }
1984
1985 =head2 symbol_chars
1986
1987  Title   : symbol_chars
1988  Usage   : my @symbolchars = $aln->symbol_chars;
1989  Function: Returns all the seen symbols (other than gaps)
1990  Returns : array of characters that are the seen symbols
1991  Args    : boolean to include the gap/missing/match characters
1992
1993 =cut
1994
1995 sub symbol_chars{
1996    my ($self,$includeextra) = @_;
1997
1998    unless ($self->{'_symbols'}) {
1999        foreach my $seq ($self->each_seq) {
2000            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
2001        }
2002    }
2003    my %copy = %{$self->{'_symbols'}};
2004    if( ! $includeextra ) {
2005        foreach my $char ( $self->gap_char, $self->match_char,
2006                           $self->missing_char) {
2007            delete $copy{$char} if( defined $char );
2008        }
2009    }
2010    return keys %copy;
2011 }
2012
2013 =head1 Alignment descriptors
2014
2015 These read only methods describe the MSA in various ways.
2016
2017
2018 =head2 score
2019
2020  Title     : score
2021  Usage     : $str = $ali->score()
2022  Function  : get/set a score of the alignment
2023  Returns   : a score for the alignment
2024  Argument  : an optional score to set
2025
2026 =cut
2027
2028 sub score {
2029   my $self = shift;
2030   $self->{score} = shift if @_;
2031   return $self->{score};
2032 }
2033
2034 =head2 consensus_string
2035
2036  Title     : consensus_string
2037  Usage     : $str = $ali->consensus_string($threshold_percent)
2038  Function  : Makes a strict consensus
2039  Returns   : Consensus string
2040  Argument  : Optional threshold ranging from 0 to 100.
2041              The consensus residue has to appear at least threshold %
2042              of the sequences at a given location, otherwise a '?'
2043              character will be placed at that location.
2044              (Default value = 0%)
2045
2046 =cut
2047
2048 sub consensus_string {
2049     my $self = shift;
2050     my $threshold = shift;
2051
2052     my $out = "";
2053     my $len = $self->length - 1;
2054
2055     foreach ( 0 .. $len ) {
2056         $out .= $self->_consensus_aa($_,$threshold);
2057     }
2058     return $out;
2059 }
2060
2061 =head2 consensus_conservation
2062
2063  Title     : consensus_conservation
2064  Usage     : @conservation = $ali->consensus_conservation();
2065  Function  : Conservation (as a percent) of each position of alignment
2066  Returns   : Array of percentages [0-100]. Gap columns are 0% conserved.
2067  Argument  :
2068
2069 =cut
2070
2071 sub consensus_conservation {
2072     my $self = shift;
2073     my @cons;
2074     my $num_sequences = $self->num_sequences;
2075     foreach my $point (0..$self->length-1) {
2076         my %hash = $self->_consensus_counts($point);
2077         # max frequency of a non-gap letter
2078         my $max = (sort {$b<=>$a} values %hash )[0];
2079         push @cons, 100 * $max / $num_sequences;
2080     }
2081     return @cons;
2082 }
2083
2084 sub _consensus_aa {
2085     my $self = shift;
2086     my $point = shift;
2087     my $threshold_percent = shift || -1 ;
2088     my ($seq,%hash,$count,$letter,$key);
2089     my $gapchar = $self->gap_char;
2090     %hash = $self->_consensus_counts($point);
2091     my $number_of_sequences = $self->num_sequences();
2092     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2093     $count = -1;
2094     $letter = '?';
2095
2096     foreach $key ( sort keys %hash ) {
2097         # print "Now at $key $hash{$key}\n";
2098         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2099             $letter = $key;
2100             $count = $hash{$key};
2101         }
2102     }
2103     return $letter;
2104 }
2105
2106 # Frequency of each letter in one column
2107 sub _consensus_counts {
2108     my $self = shift;
2109     my $point = shift;
2110     my %hash;
2111     my $gapchar = $self->gap_char;
2112     foreach my $seq ( $self->each_seq() ) {
2113         my $letter = substr($seq->seq,$point,1);
2114         $self->throw("--$point-----------") if $letter eq '';
2115         ($letter eq $gapchar || $letter =~ /\./) && next;
2116         $hash{$letter}++;
2117     }
2118     return %hash;
2119 }
2120
2121
2122 =head2 consensus_iupac
2123
2124  Title     : consensus_iupac
2125  Usage     : $str = $ali->consensus_iupac()
2126  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2127              and RNA. The output is in upper case except when gaps in
2128              a column force output to be in lower case.
2129
2130              Note that if your alignment sequences contain a lot of
2131              IUPAC ambiquity codes you often have to manually set
2132              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2133              indicate a protein sequence.
2134  Returns   : consensus string
2135  Argument  : none
2136  Throws    : on protein sequences
2137
2138 =cut
2139
2140 sub consensus_iupac {
2141     my $self = shift;
2142     my $out = "";
2143     my $len = $self->length-1;
2144
2145     # only DNA and RNA sequences are valid
2146     foreach my $seq ( $self->each_seq() ) {
2147         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2148             if $seq->alphabet eq 'protein';
2149     }
2150     # loop over the alignment columns
2151     foreach my $count ( 0 .. $len ) {
2152         $out .= $self->_consensus_iupac($count);
2153     }
2154     return $out;
2155 }
2156
2157 sub _consensus_iupac {
2158     my ($self, $column) = @_;
2159     my ($string, $char, $rna);
2160
2161     #determine all residues in a column
2162     foreach my $seq ( $self->each_seq() ) {
2163         $string .= substr($seq->seq, $column, 1);
2164     }
2165     $string = uc $string;
2166
2167     # quick exit if there's an N in the string
2168     if ($string =~ /N/) {
2169         $string =~ /\W/ ? return 'n' : return 'N';
2170     }
2171     # ... or if there are only gap characters
2172     return '-' if $string =~ /^\W+$/;
2173
2174     # treat RNA as DNA in regexps
2175     if ($string =~ /U/) {
2176         $string =~ s/U/T/;
2177         $rna = 1;
2178     }
2179
2180     # the following s///'s only need to be done to the _first_ ambiguity code
2181     # as we only need to see the _range_ of characters in $string
2182
2183     if ($string =~ /[VDHB]/) {
2184         $string =~ s/V/AGC/;
2185         $string =~ s/D/AGT/;
2186         $string =~ s/H/ACT/;
2187         $string =~ s/B/CTG/;
2188     }
2189
2190     if ($string =~ /[SKYRWM]/) {
2191         $string =~ s/S/GC/;
2192         $string =~ s/K/GT/;
2193         $string =~ s/Y/CT/;
2194         $string =~ s/R/AG/;
2195         $string =~ s/W/AT/;
2196         $string =~ s/M/AC/;
2197     }
2198
2199     # and now the guts of the thing
2200
2201     if ($string =~ /A/) {
2202         $char = 'A';                     # A                      A
2203         if ($string =~ /G/) {
2204             $char = 'R';                 # A and G (purines)      R
2205             if ($string =~ /C/) {
2206                 $char = 'V';             # A and G and C          V
2207                 if ($string =~ /T/) {
2208                     $char = 'N';         # A and G and C and T    N
2209                 }
2210             } elsif ($string =~ /T/) {
2211                 $char = 'D';             # A and G and T          D
2212             }
2213         } elsif ($string =~ /C/) {
2214             $char = 'M';                 # A and C                M
2215             if ($string =~ /T/) {
2216                 $char = 'H';             # A and C and T          H
2217             }
2218         } elsif ($string =~ /T/) {
2219             $char = 'W';                 # A and T                W
2220         }
2221     } elsif ($string =~ /C/) {
2222         $char = 'C';                     # C                      C
2223         if ($string =~ /T/) {
2224             $char = 'Y';                 # C and T (pyrimidines)  Y
2225             if ($string =~ /G/) {
2226                 $char = 'B';             # C and T and G          B
2227             }
2228         } elsif ($string =~ /G/) {
2229             $char = 'S';                 # C and G                S
2230         }
2231     } elsif ($string =~ /G/) {
2232         $char = 'G';                     # G                      G
2233         if ($string =~ /C/) {
2234             $char = 'S';                 # G and C                S
2235         } elsif ($string =~ /T/) {
2236             $char = 'K';                 # G and T                K
2237         }
2238     } elsif ($string =~ /T/) {
2239         $char = 'T';                     # T                      T
2240     }
2241
2242     $char = 'U' if $rna and $char eq 'T';
2243     $char = lc $char if $string =~ /\W/;
2244
2245     return $char;
2246 }
2247
2248
2249 =head2 consensus_meta
2250
2251  Title     : consensus_meta
2252  Usage     : $seqmeta = $ali->consensus_meta()
2253  Function  : Returns a Bio::Seq::Meta object containing the consensus
2254              strings derived from meta data analysis.
2255  Returns   : Bio::Seq::Meta
2256  Argument  : Bio::Seq::Meta
2257  Throws    : non-MetaI object
2258
2259 =cut
2260
2261 sub consensus_meta {
2262     my ($self, $meta) = @_;
2263     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2264         $self->throw('Not a Bio::Seq::MetaI object');
2265     }
2266     return $self->{'_aln_meta'} = $meta if $meta;
2267     return $self->{'_aln_meta'}
2268 }
2269
2270 =head2 is_flush
2271
2272  Title     : is_flush
2273  Usage     : if ( $ali->is_flush() )
2274  Function  : Tells you whether the alignment
2275            : is flush, i.e. all of the same length
2276  Returns   : 1 or 0
2277  Argument  :
2278
2279 =cut
2280
2281 sub is_flush {
2282     my ($self,$report) = @_;
2283     my $seq;
2284     my $length = (-1);
2285     my $temp;
2286
2287     foreach $seq ( $self->each_seq() ) {
2288         if( $length == (-1) ) {
2289             $length = CORE::length($seq->seq());
2290             next;
2291         }
2292
2293         $temp = CORE::length($seq->seq());
2294         if( $temp != $length ) {
2295             $self->warn("expecting $length not $temp from ".
2296                         $seq->display_id) if( $report );
2297             $self->debug("expecting $length not $temp from ".
2298                          $seq->display_id);
2299             $self->debug($seq->seq(). "\n");
2300             return 0;
2301         }
2302     }
2303
2304     return 1;
2305 }
2306
2307
2308 =head2 length
2309
2310  Title     : length()
2311  Usage     : $len = $ali->length()
2312  Function  : Returns the maximum length of the alignment.
2313              To be sure the alignment is a block, use is_flush
2314  Returns   : Integer
2315  Argument  :
2316
2317 =cut
2318
2319 sub length_aln {
2320     my $self = shift;
2321     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2322     $self->length(@_);
2323 }
2324
2325 sub length {
2326     my $self = shift;
2327     my $seq;
2328     my $length = -1;
2329     my $temp;
2330
2331     foreach $seq ( $self->each_seq() ) {
2332         $temp = $seq->length();
2333         if( $temp > $length ) {
2334             $length = $temp;
2335         }
2336     }
2337
2338     return $length;
2339 }
2340
2341
2342 =head2 maxdisplayname_length
2343
2344  Title     : maxdisplayname_length
2345  Usage     : $ali->maxdisplayname_length()
2346  Function  : Gets the maximum length of the displayname in the
2347              alignment. Used in writing out various MSA formats.
2348  Returns   : integer
2349  Argument  :
2350
2351 =cut
2352
2353 sub maxname_length {
2354     my $self = shift;
2355     $self->deprecated("maxname_length - deprecated method.".
2356                       " Use maxdisplayname_length() instead.");
2357     $self->maxdisplayname_length();
2358 }
2359
2360 sub maxnse_length {
2361     my $self = shift;
2362     $self->deprecated("maxnse_length - deprecated method.".
2363                       " Use maxnse_length() instead.");
2364     $self->maxdisplayname_length();
2365 }
2366
2367 sub maxdisplayname_length {
2368     my $self = shift;
2369     my $maxname = (-1);
2370     my ($seq,$len);
2371
2372     foreach $seq ( $self->each_seq() ) {
2373         $len = CORE::length $self->displayname($seq->get_nse());
2374
2375         if( $len > $maxname ) {
2376             $maxname = $len;
2377         }
2378     }
2379
2380     return $maxname;
2381 }
2382
2383 =head2 max_metaname_length
2384
2385  Title     : max_metaname_length
2386  Usage     : $ali->max_metaname_length()
2387  Function  : Gets the maximum length of the meta name tags in the
2388              alignment for the sequences and for the alignment.
2389              Used in writing out various MSA formats.
2390  Returns   : integer
2391  Argument  : None
2392
2393 =cut
2394
2395 sub max_metaname_length {
2396     my $self = shift;
2397     my $maxname = (-1);
2398     my ($seq,$len);
2399
2400     # check seq meta first
2401     for $seq ( $self->each_seq() ) {
2402         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2403         for my $mtag ($seq->meta_names) {
2404             $len = CORE::length $mtag;
2405             if( $len > $maxname ) {
2406                 $maxname = $len;
2407             }
2408         }
2409     }
2410
2411     # alignment meta
2412     for my $meta ($self->consensus_meta) {
2413         next unless $meta;
2414         for my $name ($meta->meta_names) {
2415             $len = CORE::length $name;
2416             if( $len > $maxname ) {
2417                 $maxname = $len;
2418             }
2419         }
2420     }
2421
2422     return $maxname;
2423 }
2424
2425 =head2 num_residues
2426
2427  Title     : num_residues
2428  Usage     : $no = $ali->num_residues
2429  Function  : number of residues in total in the alignment
2430  Returns   : integer
2431  Argument  :
2432  Note      : replaces no_residues()
2433
2434 =cut
2435
2436 sub num_residues {
2437     my $self = shift;
2438     my $count = 0;
2439
2440     foreach my $seq ($self->each_seq) {
2441         my $str = $seq->seq();
2442
2443         $count += ($str =~ s/[A-Za-z]//g);
2444     }
2445
2446     return $count;
2447 }
2448
2449 =head2 num_sequences
2450
2451  Title     : num_sequences
2452  Usage     : $depth = $ali->num_sequences
2453  Function  : number of sequence in the sequence alignment
2454  Returns   : integer
2455  Argument  : none
2456  Note      : replaces no_sequences()
2457
2458 =cut
2459
2460 sub num_sequences {
2461     my $self = shift;
2462     return scalar($self->each_seq);
2463 }
2464
2465
2466 =head2 average_percentage_identity
2467
2468  Title   : average_percentage_identity
2469  Usage   : $id = $align->average_percentage_identity
2470  Function: The function uses a fast method to calculate the average
2471            percentage identity of the alignment
2472  Returns : The average percentage identity of the alignment
2473  Args    : None
2474  Notes   : This method implemented by Kevin Howe calculates a figure that is
2475            designed to be similar to the average pairwise identity of the
2476            alignment (identical in the absence of gaps), without having to
2477            explicitly calculate pairwise identities proposed by Richard Durbin.
2478            Validated by Ewan Birney ad Alex Bateman.
2479
2480 =cut
2481
2482 sub average_percentage_identity{
2483    my ($self,@args) = @_;
2484
2485    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2486                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2487
2488    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2489
2490    if (! $self->is_flush()) {
2491        $self->throw("All sequences in the alignment must be the same length");
2492    }
2493
2494    @seqs = $self->each_seq();
2495    $len = $self->length();
2496
2497    # load the each hash with correct keys for existence checks
2498
2499    for( my $index=0; $index < $len; $index++) {
2500        foreach my $letter (@alphabet) {
2501            $countHashes[$index]->{$letter} = 0;
2502        }
2503    }
2504    foreach my $seq (@seqs)  {
2505        my @seqChars = split //, $seq->seq();
2506        for( my $column=0; $column < @seqChars; $column++ ) {
2507            my $char = uc($seqChars[$column]);
2508            if (exists $countHashes[$column]->{$char}) {
2509                $countHashes[$column]->{$char}++;
2510            }
2511        }
2512    }
2513
2514    $total = 0;
2515    $divisor = 0;
2516    for(my $column =0; $column < $len; $column++) {
2517        my %hash = %{$countHashes[$column]};
2518        $subdivisor = 0;
2519        foreach my $res (keys %hash) {
2520            $total += $hash{$res}*($hash{$res} - 1);
2521            $subdivisor += $hash{$res};
2522        }
2523        $divisor += $subdivisor * ($subdivisor - 1);
2524    }
2525    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2526 }
2527
2528 =head2 percentage_identity
2529
2530  Title   : percentage_identity
2531  Usage   : $id = $align->percentage_identity
2532  Function: The function calculates the average percentage identity
2533            (aliased to average_percentage_identity)
2534  Returns : The average percentage identity
2535  Args    : None
2536
2537 =cut
2538
2539 sub percentage_identity {
2540     my $self = shift;
2541     return $self->average_percentage_identity();
2542 }
2543
2544 =head2 overall_percentage_identity
2545
2546  Title   : overall_percentage_identity
2547  Usage   : $id = $align->overall_percentage_identity
2548            $id = $align->overall_percentage_identity('short')
2549  Function: The function calculates the percentage identity of
2550            the conserved columns
2551  Returns : The percentage identity of the conserved columns
2552  Args    : length value to use, optional defaults to alignment length
2553                  possible values: 'align', 'short', 'long'
2554
2555 The argument values 'short' and 'long' refer to shortest and longest
2556 sequence in the alignment. Method modification code by Hongyu Zhang.
2557
2558 =cut
2559
2560 sub overall_percentage_identity{
2561    my ($self, $length_measure) = @_;
2562
2563    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2564
2565    my %enum = map {$_ => undef} qw (align short long);
2566
2567    $self->throw("Unknown argument [$length_measure]")
2568        if $length_measure and not exists $enum{$length_measure};
2569    $length_measure ||= 'align';
2570
2571    if (! $self->is_flush()) {
2572        $self->throw("All sequences in the alignment must be the same length");
2573    }
2574
2575    # Count the residues seen at each position
2576    my $len;
2577    my $total = 0; # number of positions with identical residues
2578    my @countHashes;
2579    my @seqs = $self->each_seq;
2580    my $nof_seqs = scalar @seqs;
2581    my $aln_len = $self->length();
2582    for my $seq (@seqs)  {
2583        my $seqstr = $seq->seq;
2584
2585        # Count residues for given sequence
2586        for my $column (0 .. $aln_len-1) {
2587            my $char = uc( substr($seqstr, $column, 1) );
2588            if ( exists $alphabet{$char} ) {
2589
2590                # This is a valid char
2591                if ( defined $countHashes[$column]->{$char} ) {
2592                  $countHashes[$column]->{$char}++;
2593                } else {
2594                  $countHashes[$column]->{$char} = 1;
2595                }
2596
2597                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2598                    # All sequences have this same residue
2599                    $total++;
2600                }
2601
2602            }
2603        }
2604
2605        # Sequence length
2606        if ($length_measure eq 'short' || $length_measure eq 'long') {
2607            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2608            if ($length_measure eq 'short') {
2609                if ( (not defined $len) || ($seq_len < $len) ) {
2610                    $len = $seq_len;
2611                }
2612            } elsif ($length_measure eq 'long') {
2613                if ( (not defined $len) || ($seq_len > $len) ) {
2614                    $len = $seq_len;
2615                }
2616            }
2617        }
2618
2619    }
2620
2621    if ($length_measure eq 'align') {
2622        $len = $aln_len;
2623    }
2624
2625    return ($total / $len ) * 100.0;
2626 }
2627
2628
2629
2630 =head1 Alignment positions
2631
2632 Methods to map a sequence position into an alignment column and back.
2633 column_from_residue_number() does the former. The latter is really a
2634 property of the sequence object and can done using
2635 L<Bio::LocatableSeq::location_from_column>:
2636
2637     # select somehow a sequence from the alignment, e.g.
2638     my $seq = $aln->get_seq_by_pos(1);
2639     #$loc is undef or Bio::LocationI object
2640     my $loc = $seq->location_from_column(5);
2641
2642 =head2 column_from_residue_number
2643
2644  Title   : column_from_residue_number
2645  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2646  Function: This function gives the position in the alignment
2647            (i.e. column number) of the given residue number in the
2648            sequence with the given name. For example, for the
2649            alignment
2650
2651              Seq1/91-97 AC..DEF.GH.
2652              Seq2/24-30 ACGG.RTY...
2653                 Seq3/43-51 AC.DDEF.GHI
2654
2655            column_from_residue_number( "Seq1", 94 ) returns 6.
2656            column_from_residue_number( "Seq2", 25 ) returns 2.
2657            column_from_residue_number( "Seq3", 50 ) returns 10.
2658
2659            An exception is thrown if the residue number would lie
2660            outside the length of the aligment
2661            (e.g. column_from_residue_number( "Seq2", 22 )
2662
2663           Note: If the the parent sequence is represented by more than
2664                 one alignment sequence and the residue number is present in
2665                 them, this method finds only the first one.
2666
2667  Returns : A column number for the position in the alignment of the
2668            given residue in the given sequence (1 = first column)
2669  Args    : A sequence id/name (not a name/start-end)
2670            A residue number in the whole sequence (not just that
2671            segment of it in the alignment)
2672
2673 =cut
2674
2675 sub column_from_residue_number {
2676     my ($self, $name, $resnumber) = @_;
2677
2678     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2679     $self->throw("Second argument residue number missing") unless $resnumber;
2680
2681     foreach my $seq ($self->each_seq_with_id($name)) {
2682         my $col;
2683         eval {
2684             $col = $seq->column_from_residue_number($resnumber);
2685         };
2686         next if $@;
2687         return $col;
2688     }
2689
2690     $self->throw("Could not find a sequence segment in $name ".
2691                  "containing residue number $resnumber");
2692
2693 }
2694
2695 =head1 Sequence names
2696
2697 Methods to manipulate the display name. The default name based on the
2698 sequence id and subsequence positions can be overridden in various
2699 ways.
2700
2701 =head2 displayname
2702
2703  Title     : displayname
2704  Usage     : $myalign->displayname("Ig", "IgA")
2705  Function  : Gets/sets the display name of a sequence in the alignment
2706  Returns   : A display name string
2707  Argument  : name of the sequence
2708              displayname of the sequence (optional)
2709
2710 =cut
2711
2712 sub get_displayname {
2713     my $self = shift;
2714     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2715     $self->displayname(@_);
2716 }
2717
2718 sub set_displayname {
2719     my $self = shift;
2720     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2721     $self->displayname(@_);
2722 }
2723
2724 sub displayname {
2725     my ($self, $name, $disname) = @_;
2726
2727     $self->throw("No sequence with name [$name]")
2728         unless defined $self->{'_seq'}->{$name};
2729
2730     if(  $disname and  $name) {
2731         $self->{'_dis_name'}->{$name} = $disname;
2732         return $disname;
2733     }
2734     elsif( defined $self->{'_dis_name'}->{$name} ) {
2735         return  $self->{'_dis_name'}->{$name};
2736     } else {
2737         return $name;
2738     }
2739 }
2740
2741 =head2 set_displayname_count
2742
2743  Title     : set_displayname_count
2744  Usage     : $ali->set_displayname_count
2745  Function  : Sets the names to be name_# where # is the number of
2746              times this name has been used.
2747  Returns   : 1, on success
2748  Argument  :
2749
2750 =cut
2751
2752 sub set_displayname_count {
2753     my $self= shift;
2754     my (@arr,$name,$seq,$count,$temp,$nse);
2755
2756     foreach $seq ( $self->each_alphabetically() ) {
2757         $nse = $seq->get_nse();
2758
2759         #name will be set when this is the second
2760         #time (or greater) is has been seen
2761
2762         if( defined $name and $name eq ($seq->id()) ) {
2763             $temp = sprintf("%s_%s",$name,$count);
2764             $self->displayname($nse,$temp);
2765             $count++;
2766         } else {
2767             $count = 1;
2768             $name = $seq->id();
2769             $temp = sprintf("%s_%s",$name,$count);
2770             $self->displayname($nse,$temp);
2771             $count++;
2772         }
2773     }
2774     return 1;
2775 }
2776
2777 =head2 set_displayname_flat
2778
2779  Title     : set_displayname_flat
2780  Usage     : $ali->set_displayname_flat()
2781  Function  : Makes all the sequences be displayed as just their name,
2782              not name/start-end
2783  Returns   : 1
2784  Argument  :
2785
2786 =cut
2787
2788 sub set_displayname_flat {
2789     my $self = shift;
2790     my ($nse,$seq);
2791
2792     foreach $seq ( $self->each_seq() ) {
2793         $nse = $seq->get_nse();
2794         $self->displayname($nse,$seq->id());
2795     }
2796     return 1;
2797 }
2798
2799 =head2 set_displayname_normal
2800
2801  Title     : set_displayname_normal
2802  Usage     : $ali->set_displayname_normal()
2803  Function  : Makes all the sequences be displayed as name/start-end
2804  Returns   : 1, on success
2805  Argument  :
2806
2807 =cut
2808
2809 sub set_displayname_normal {
2810     my $self = shift;
2811     my ($nse,$seq);
2812
2813     foreach $seq ( $self->each_seq() ) {
2814         $nse = $seq->get_nse();
2815         $self->displayname($nse,$nse);
2816     }
2817     return 1;
2818 }
2819
2820 =head2 source
2821
2822  Title   : source
2823  Usage   : $obj->source($newval)
2824  Function: sets the Alignment source program
2825  Example :
2826  Returns : value of source
2827  Args    : newvalue (optional)
2828
2829
2830 =cut
2831
2832 sub source{
2833    my ($self,$value) = @_;
2834    if( defined $value) {
2835       $self->{'_source'} = $value;
2836     }
2837     return $self->{'_source'};
2838 }
2839
2840 =head2 set_displayname_safe
2841
2842  Title     : set_displayname_safe
2843  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2844  Function  : Assign machine-generated serial names to sequences in input order.
2845              Designed to protect names during PHYLIP runs. Assign 10-char string
2846              in the form of "S000000001" to "S999999999". Restore the original
2847              names using "restore_displayname".
2848  Returns   : 1. a new $aln with system names;
2849              2. a hash ref for restoring names
2850  Argument  : Number for id length (default 10)
2851
2852 =cut
2853
2854 sub set_displayname_safe {
2855     my $self = shift;
2856     my $idlength = shift || 10;
2857     my ($seq, %phylip_name);
2858     my $ct=0;
2859     my $new=Bio::SimpleAlign->new();
2860     foreach $seq ( $self->each_seq() ) {
2861         $ct++;
2862         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2863         $phylip_name{$pname}=$seq->id();
2864         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2865                                             -seq      => $seq->seq(),
2866                                             -alphabet => $seq->alphabet,
2867                                             -start    => $seq->{_start},
2868                                             -end      => $seq->{_end}
2869                                             );
2870         $new->add_seq($new_seq);
2871     }
2872
2873     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2874     return ($new, \%phylip_name);
2875 }
2876
2877 =head2 restore_displayname
2878
2879  Title     : restore_displayname
2880  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2881  Function  : Restore original sequence names (after running
2882              $ali->set_displayname_safe)
2883  Returns   : a new $aln with names restored.
2884  Argument  : a hash reference of names from "set_displayname_safe".
2885
2886 =cut
2887
2888 sub restore_displayname {
2889     my $self = shift;
2890     my $ref=shift;
2891     my %name=%$ref;
2892     my $new=Bio::SimpleAlign->new();
2893     foreach my $seq ( $self->each_seq() ) {
2894       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2895       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2896                                           -seq      => $seq->seq(),
2897                                           -alphabet => $seq->alphabet,
2898                                           -start    => $seq->{_start},
2899                                           -end      => $seq->{_end}
2900                                           );
2901       $new->add_seq($new_seq);
2902     }
2903     return $new;
2904 }
2905
2906 =head2 sort_by_start
2907
2908  Title     : sort_by_start
2909  Usage     : $ali->sort_by_start
2910  Function  : Changes the order of the alignment to the start position of each
2911              subalignment
2912  Returns   :
2913  Argument  :
2914
2915 =cut
2916
2917 sub sort_by_start {
2918     my $self = shift;
2919     my ($seq,$nse,@arr,%hash,$count);
2920     foreach $seq ( $self->each_seq() ) {
2921         $nse = $seq->get_nse;
2922         $hash{$nse} = $seq;
2923     }
2924     $count = 0;
2925     %{$self->{'_order'}} = (); # reset the hash;
2926     foreach $nse ( sort _startend keys %hash) {
2927         $self->{'_order'}->{$count} = $nse;
2928         $count++;
2929     }
2930     1;
2931 }
2932
2933 sub _startend
2934 {
2935     my ($aname,$arange) = split (/[\/]/,$a);
2936     my ($bname,$brange) = split (/[\/]/,$b);
2937     my ($astart,$aend) = split(/\-/,$arange);
2938     my ($bstart,$bend) = split(/\-/,$brange);
2939     return $astart <=> $bstart;
2940 }
2941
2942 =head2 bracket_string
2943
2944  Title     : bracket_string
2945  Usage     : my @params = (-refseq     => 'testseq',
2946                            -allele1    => 'allele1',
2947                            -allele2    => 'allele2',
2948                            -delimiters => '{}',
2949                            -separator  => '/');
2950              $str = $aln->bracket_string(@params)
2951
2952  Function :  When supplied with a list of parameters (see below), returns a
2953              string in BIC format. This is used for allelic comparisons.
2954              Briefly, if either allele contains a base change when compared to
2955              the refseq, the base or gap for each allele is represented in
2956              brackets in the order present in the 'alleles' parameter.
2957
2958              For the following data:
2959
2960              >testseq
2961              GGATCCATTGCTACT
2962              >allele1
2963              GGATCCATTCCTACT
2964              >allele2
2965              GGAT--ATTCCTCCT
2966
2967              the returned string with parameters 'refseq => testseq' and
2968              'alleles => [qw(allele1 allele2)]' would be:
2969
2970              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2971  Returns   : BIC-formatted string
2972  Argument  : Required args
2973                 refseq    : string (ID) of the reference sequence used
2974                             as basis for comparison
2975                 allele1   : string (ID) of the first allele
2976                 allele2   : string (ID) of the second allele
2977              Optional args
2978                 delimiters: two symbol string of left and right delimiters.
2979                             Only the first two symbols are used
2980                             default = '[]'
2981                 separator : string used as a separator.  Only the first
2982                             symbol is used
2983                             default = '/'
2984  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2985
2986 =cut
2987
2988 sub bracket_string {
2989     my ($self, @args) = @_;
2990     my ($ref, $a1, $a2, $delim, $sep) =
2991         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2992     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2993     my ($ld, $rd);
2994     ($ld, $rd) = split('', $delim, 2) if $delim;
2995     $ld ||= '[';
2996     $rd ||= ']';
2997     $sep ||= '/';
2998     my ($refseq, $allele1, $allele2) =
2999         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
3000     if (!$refseq || !$allele1 || !$allele2) {
3001         $self->throw("One of your refseq/allele IDs is invalid!");
3002     }
3003     my $len = $self->length-1;
3004     my $bic = '';
3005     # loop over the alignment columns
3006     for my $column ( 0 .. $len ) {
3007         my $string;
3008         my ($compres, $res1, $res2) =
3009             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3010         # are any of the allele symbols different from the refseq?
3011         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3012                 $ld.$res1.$sep.$res2.$rd;
3013         $bic .= $string;
3014     }
3015     return $bic;
3016 }
3017
3018
3019 =head2 methods implementing Bio::FeatureHolderI
3020
3021 FeatureHolderI implementation to support labeled character sets like one
3022 would get from NEXUS represented data.
3023
3024 =head2 get_SeqFeatures
3025
3026  Usage   : @features = $aln->get_SeqFeatures
3027  Function: Get the feature objects held by this feature holder.
3028  Example :
3029  Returns : an array of Bio::SeqFeatureI implementing objects
3030  Args    : optional filter coderef, taking a Bio::SeqFeatureI
3031          : as argument, returning TRUE if wanted, FALSE if
3032          : unwanted
3033
3034 =cut
3035
3036 sub get_SeqFeatures {
3037     my $self = shift;
3038     my $filter_cb = shift;
3039     $self->throw("Arg (filter callback) must be a coderef") unless
3040         !defined($filter_cb) or ref($filter_cb) eq 'CODE';
3041     if( !defined $self->{'_as_feat'} ) {
3042         $self->{'_as_feat'} = [];
3043     }
3044     if ($filter_cb) {
3045         return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
3046     }
3047     return @{$self->{'_as_feat'}};
3048 }
3049
3050 =head2 add_SeqFeature
3051
3052  Usage   : $aln->add_SeqFeature($subfeat);
3053  Function: adds a SeqFeature into the SeqFeature array.
3054  Example :
3055  Returns : true on success
3056  Args    : a Bio::SeqFeatureI object
3057  Note    : This implementation is not compliant
3058            with Bio::FeatureHolderI
3059
3060 =cut
3061
3062 sub add_SeqFeature {
3063    my ($self,@feat) = @_;
3064
3065    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3066
3067    foreach my $feat ( @feat ) {
3068        if( !$feat->isa("Bio::SeqFeatureI") ) {
3069            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3070        }
3071
3072        push(@{$self->{'_as_feat'}},$feat);
3073    }
3074    return 1;
3075 }
3076
3077
3078 =head2 remove_SeqFeatures
3079
3080  Usage   : $obj->remove_SeqFeatures
3081  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3082            remove that subset from the returned array, and add back the rest.
3083  Returns : The array of Bio::SeqFeatureI features that was
3084            deleted from this alignment.
3085  Args    : none
3086
3087 =cut
3088
3089 sub remove_SeqFeatures {
3090     my $self = shift;
3091
3092     return () unless $self->{'_as_feat'};
3093     my @feats = @{$self->{'_as_feat'}};
3094     $self->{'_as_feat'} = [];
3095     return @feats;
3096 }
3097
3098 =head2 feature_count
3099
3100  Title   : feature_count
3101  Usage   : $obj->feature_count()
3102  Function: Return the number of SeqFeatures attached to the alignment
3103  Returns : integer representing the number of SeqFeatures
3104  Args    : None
3105
3106 =cut
3107
3108 sub feature_count {
3109     my ($self) = @_;
3110
3111     if (defined($self->{'_as_feat'})) {
3112         return ($#{$self->{'_as_feat'}} + 1);
3113     } else {
3114         return 0;
3115     }
3116 }
3117
3118 =head2 get_all_SeqFeatures
3119
3120  Title   : get_all_SeqFeatures
3121  Usage   :
3122  Function: Get all SeqFeatures.
3123  Example :
3124  Returns : an array of Bio::SeqFeatureI implementing objects
3125  Args    : none
3126  Note    : Falls through to Bio::FeatureHolderI implementation.
3127
3128 =cut
3129
3130 =head2 methods for Bio::AnnotatableI
3131
3132 AnnotatableI implementation to support sequence alignments which
3133 contain annotation (NEXUS, Stockholm).
3134
3135 =head2 annotation
3136
3137  Title   : annotation
3138  Usage   : $ann = $aln->annotation or
3139            $aln->annotation($ann)
3140  Function: Gets or sets the annotation
3141  Returns : Bio::AnnotationCollectionI object
3142  Args    : None or Bio::AnnotationCollectionI object
3143
3144 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3145 for more information
3146
3147 =cut
3148
3149 sub annotation {
3150     my ($obj,$value) = @_;
3151     if( defined $value ) {
3152         $obj->throw("object of class ".ref($value)." does not implement ".
3153                 "Bio::AnnotationCollectionI. Too bad.")
3154             unless $value->isa("Bio::AnnotationCollectionI");
3155         $obj->{'_annotation'} = $value;
3156     } elsif( ! defined $obj->{'_annotation'}) {
3157         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3158     }
3159     return $obj->{'_annotation'};
3160 }
3161
3162 =head1 Deprecated methods
3163
3164 =cut
3165
3166 =head2 no_residues
3167
3168  Title     : no_residues
3169  Usage     : $no = $ali->no_residues
3170  Function  : number of residues in total in the alignment
3171  Returns   : integer
3172  Argument  :
3173  Note      : deprecated in favor of num_residues()
3174
3175 =cut
3176
3177 sub no_residues {
3178         my $self = shift;
3179         $self->deprecated(-warn_version => 1.0069,
3180                                           -throw_version => 1.0075,
3181                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3182     $self->num_residues(@_);
3183 }
3184
3185 =head2 no_sequences
3186
3187  Title     : no_sequences
3188  Usage     : $depth = $ali->no_sequences
3189  Function  : number of sequence in the sequence alignment
3190  Returns   : integer
3191  Argument  :
3192  Note      : deprecated in favor of num_sequences()
3193
3194 =cut
3195
3196 sub no_sequences {
3197         my $self = shift;
3198         $self->deprecated(-warn_version => 1.0069,
3199                                           -throw_version => 1.0075,
3200                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3201     $self->num_sequences(@_);
3202 }
3203
3204 =head2 mask_columns
3205
3206  Title     : mask_columns
3207  Usage     : $aln2 = $aln->mask_columns(20,30)
3208  Function  : Masks a slice of the alignment inclusive of start and
3209              end columns, and the first column in the alignment is denoted 1.
3210              Mask beyond the length of the sequence does not do padding.
3211  Returns   : A Bio::SimpleAlign object
3212  Args      : Positive integer for start column, positive integer for end column,
3213              optional string value use for the mask. Example:
3214
3215              $aln2 = $aln->mask_columns(20,30,'?')
3216  Note      : Masking must use a character that is not used for gaps or
3217              frameshifts.  These can be adjusted using the relevant global
3218              variables, but be aware these may be (uncontrollably) modified
3219              elsewhere within BioPerl (see bug 2715)
3220
3221 =cut
3222
3223 sub mask_columns {
3224     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3225     my $self = shift;
3226
3227     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3228              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3229
3230     # coordinates are alignment-based, not sequence-based
3231     my ($start, $end, $mask_char) = @_;
3232     unless (defined $mask_char) { $mask_char = 'N' }
3233
3234     $self->throw("Mask start has to be a positive integer and less than ".
3235                  "alignment length, not [$start]")
3236       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3237     $self->throw("Mask end has to be a positive integer and less than ".
3238                  "alignment length, not [$end]")
3239       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3240     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3241                  "end [$end]") unless $start <= $end;
3242     $self->throw("Mask character $mask_char has to be a single character ".
3243                  "and not a gap or frameshift symbol")
3244       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3245
3246     my $aln = $self->new;
3247     $aln->id($self->id);
3248     foreach my $seq ( $self->each_seq() ) {
3249         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3250          -alphabet => $seq->alphabet,
3251          -strand  => $seq->strand,
3252          -verbose => $self->verbose);
3253
3254         # convert from 1-based alignment coords!
3255         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3256         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3257         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3258         $new_seq->seq($new_dna_string);
3259         $aln->add_seq($new_seq);
3260     }
3261     return $aln;
3262 }
3263
3264
3265
3266 1;