Bio/SimpleAlign.pm

   1 # BioPerl module for SimpleAlign
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12 #
  13 #  History:
  14 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  15 #       May 2001 major rewrite - Heikki Lehvaslaiho
  16
  17 =head1 NAME
  18
  19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  20
  21 =head1 SYNOPSIS
  22
  23   # Use Bio::AlignIO to read in the alignment
  24   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  25   $aln = $str->next_aln();
  26
  27   # Describe
  28   print $aln->length;
  29   print $aln->num_residues;
  30   print $aln->is_flush;
  31   print $aln->num_sequences;
  32   print $aln->score;
  33   print $aln->percentage_identity;
  34   print $aln->consensus_string(50);
  35
  36   # Find the position in the alignment for a sequence location
  37   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  38
  39   # Extract sequences and check values for the alignment column $pos
  40   foreach $seq ($aln->each_seq) {
  41       $res = $seq->subseq($pos, $pos);
  42       $count{$res}++;
  43   }
  44   foreach $res (keys %count) {
  45       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  46   }
  47
  48   # Manipulate
  49   $aln->remove_seq($seq);
  50   $mini_aln = $aln->slice(20,30);  # get a block of columns
  51   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  52   $new_aln = $aln->remove_columns([20,30]); # remove by position
  53   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  54
  55   # Analyze
  56   $str = $aln->consensus_string($threshold_percent);
  57   $str = $aln->match_line();
  58   $str = $aln->cigar_line();
  59   $id = $aln->percentage_identity;
  60
  61   # See the module documentation for details and more methods.
  62
  63 =head1 DESCRIPTION
  64
  65 SimpleAlign is an object that handles a multiple sequence alignment
  66 (MSA). It is very permissive of types (it does not insist on sequences
  67 being all same length, for example). Think of it as a set of sequences
  68 with a whole series of built-in manipulations and methods for reading and
  69 writing alignments.
  70
  71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  72 to store its sequences. These are subsequences with a start and end
  73 positions in the parent reference sequence. Each sequence in the
  74 SimpleAlign object is a Bio::LocatableSeq.
  75
  76 SimpleAlign expects the combination of name, start, and end for a
  77 given sequence to be unique in the alignment, and this is the key for the
  78 internal hashes (name, start, end are abbreviated C<nse> in the code).
  79 However, in some cases people do not want the name/start-end to be displayed:
  80 either multiple names in an alignment or names specific to the alignment
  81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  82 C<displayname>, and generally is what is used to print out the
  83 alignment. They default to name/start-end.
  84
  85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  86
  87 =head1 FEEDBACK
  88
  89 =head2 Mailing Lists
  90
  91 User feedback is an integral part of the evolution of this and other
  92 Bioperl modules.  Send your comments and suggestions preferably to one
  93 of the Bioperl mailing lists.  Your participation is much appreciated.
  94
  95   bioperl-l@bioperl.org                  - General discussion
  96   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  97
  98 =head2 Support
  99
 100 Please direct usage questions or support issues to the mailing list:
 101
 102 I<bioperl-l@bioperl.org>
 103
 104 rather than to the module maintainer directly. Many experienced and
 105 reponsive experts will be able look at the problem and quickly
 106 address it. Please include a thorough description of the problem
 107 with code and data examples if at all possible.
 108
 109 =head2 Reporting Bugs
 110
 111 Report bugs to the Bioperl bug tracking system to help us keep track
 112 the bugs and their resolution. Bug reports can be submitted via the
 113 web:
 114
 115   https://redmine.open-bio.org/projects/bioperl/
 116
 117 =head1 AUTHOR
 118
 119 Ewan Birney, birney@ebi.ac.uk
 120
 121 =head1 CONTRIBUTORS
 122
 123 Allen Day, allenday-at-ucla.edu,
 124 Richard Adams, Richard.Adams-at-ed.ac.uk,
 125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 127 Allen Smith, allens-at-cpan.org,
 128 Jason Stajich, jason-at-bioperl.org,
 129 Anthony Underwood, aunderwood-at-phls.org.uk,
 130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 131 Brian Osborne, bosborne at alum.mit.edu
 132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 133 Hongyu Zhang, forward at hongyu.org
 134 Jay Hannah, jay at jays.net
 135 Alexandr Bezginov, albezg at gmail.com
 136
 137 =head1 SEE ALSO
 138
 139 L<Bio::LocatableSeq>
 140
 141 =head1 APPENDIX
 142
 143 The rest of the documentation details each of the object
 144 methods. Internal methods are usually preceded with a _
 145
 146 =cut
 147
 148 # 'Let the code begin...
 149
 150 package Bio::SimpleAlign;
 151 use vars qw(%CONSERVATION_GROUPS);
 152 use strict;
 153
 154 use Bio::LocatableSeq;  # uses Seq's as list
 155
 156 use Bio::Seq;
 157 use Bio::SeqFeature::Generic;
 158
 159 BEGIN {
 160     # This data should probably be in a more centralized module...
 161     # it is taken from Clustalw documentation.
 162     # These are all the positively scoring groups that occur in the
 163     # Gonnet Pam250 matrix. The strong and weak groups are
 164     # defined as strong score >0.5 and weak score =<0.5 respectively.
 165
 166     %CONSERVATION_GROUPS = (
 167             'strong' => [ qw(
 168                                                  STA
 169                                                  NEQK
 170                                                  NHQK
 171                                                  NDEQ
 172                                                  QHRK
 173                                                  MILV
 174                                                  MILF
 175                                                  HY
 176                                                  FYW )],
 177                                 'weak' => [ qw(
 178                       CSA
 179                                                ATV
 180                                                SAG
 181                                                STNK
 182                                                STPA
 183                                                SGND
 184                                                SNDEQK
 185                                                NDEQHK
 186                                                NEQHRK
 187                                                FVLIM
 188                                                HFY )],);
 189 }
 190
 191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 192             Bio::FeatureHolderI);
 193
 194 =head2 new
 195
 196  Title     : new
 197  Usage     : my $aln = Bio::SimpleAlign->new();
 198  Function  : Creates a new simple align object
 199  Returns   : Bio::SimpleAlign
 200  Args      : -source     => string representing the source program
 201                             where this alignment came from
 202              -annotation => Bio::AnnotationCollectionI
 203              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 204              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 205              -consensus  => consensus string
 206              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 207
 208 =cut
 209
 210
 211 sub new {
 212   my($class,@args) = @_;
 213
 214   my $self = $class->SUPER::new(@args);
 215
 216   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 217                                             SOURCE
 218                                             SCORE
 219                                             ID
 220                                             ACCESSION
 221                                             DESCRIPTION
 222                                             SEQS
 223                                             FEATURES
 224                                             ANNOTATION
 225                                             SEQ_ANNOTATION
 226                                             CONSENSUS
 227                                             CONSENSUS_META
 228                                             )], @args);
 229   $src && $self->source($src);
 230   defined $score && $self->score($score);
 231   # we need to set up internal hashs first!
 232
 233   $self->{'_seq'} = {};
 234   $self->{'_order'} = {};
 235   $self->{'_start_end_lists'} = {};
 236   $self->{'_dis_name'} = {};
 237   $self->{'_id'} = 'NoName';
 238   # maybe we should automatically read in from args. Hmmm...
 239   $id  && $self->id($id);
 240   $acc && $self->accession($acc);
 241   $desc && $self->description($desc);
 242   $coll && $self->annotation($coll);
 243   # sequence annotation is layered into a provided annotation collection (or dies)
 244   if ($sa) {
 245     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 246                  "with a sequence annotation collection")
 247         if !$coll;
 248     $coll->add_Annotation('seq_annotation', $sa);
 249   }
 250   if ($feats && ref $feats eq 'ARRAY') {
 251     for my $feat (@$feats) {
 252         $self->add_SeqFeature($feat);
 253     }
 254   }
 255   $con && $self->consensus($con);
 256   $cmeta && $self->consensus_meta($cmeta);
 257   # assumes these are in correct alignment order
 258   if ($seqs && ref($seqs) eq 'ARRAY') {
 259     for my $seq (@$seqs) {
 260         $self->add_seq($seq);
 261     }
 262   }
 263
 264   return $self; # success - we hope!
 265 }
 266
 267 =head1 Modifier methods
 268
 269 These methods modify the MSA by adding, removing or shuffling complete
 270 sequences.
 271
 272 =head2 add_seq
 273
 274  Title     : add_seq
 275  Usage     : $myalign->add_seq($newseq);
 276              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 277  Function  : Adds another sequence to the alignment. *Does not* align
 278              it - just adds it to the hashes.
 279              If -ORDER is specified, the sequence is inserted at the
 280              the position spec'd by -ORDER, and existing sequences
 281              are pushed down the storage array.
 282  Returns   : nothing
 283  Args      : A Bio::LocatableSeq object
 284              Positive integer for the sequence position (optional)
 285
 286 See L<Bio::LocatableSeq> for more information
 287
 288 =cut
 289
 290 sub addSeq {
 291     my $self = shift;
 292     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 293     $self->add_seq(@_);
 294 }
 295
 296 sub add_seq {
 297     my $self = shift;
 298     my @args = @_;
 299     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 300     my ($name,$id,$start,$end);
 301
 302     unless ($seq) {
 303         $self->throw("LocatableSeq argument required");
 304     }
 305     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 306         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 307     }
 308     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 309     $order--; # jay's patch (user-specified order is 1-origin)
 310
 311     if ($order < 0) {
 312         $self->throw("User-specified value for ORDER must be >= 1");
 313     }
 314
 315     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 316
 317     # build the symbol list for this sequence,
 318     # will prune out the gap and missing/match chars
 319     # when actually asked for the symbol list in the
 320     # symbol_chars
 321     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 322
 323     $name = $seq->get_nse;
 324
 325     if( $self->{'_seq'}->{$name} ) {
 326         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 327     }
 328     else {
 329         $self->debug( "Assigning $name to $order\n");
 330
 331     my $ordh = $self->{'_order'};
 332     if ($ordh->{$order}) {
 333         # make space to insert
 334         # $c->() returns (in reverse order) the first subsequence
 335         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 336         # (3,2,1), and $c->(2,4,5) returns (2).
 337         my $c;
 338         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 339         map {
 340      $ordh->{$_+1} = $ordh->{$_}
 341         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 342
 343     }
 344     $ordh->{$order} = $name;
 345
 346         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 347             $self->{'_start_end_lists'}->{$id} = [];
 348         }
 349         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 350     }
 351
 352     $self->{'_seq'}->{$name} = $seq;
 353
 354 }
 355
 356
 357 =head2 remove_seq
 358
 359  Title     : remove_seq
 360  Usage     : $aln->remove_seq($seq);
 361  Function  : Removes a single sequence from an alignment
 362  Returns   :
 363  Argument  : a Bio::LocatableSeq object
 364
 365 =cut
 366
 367 sub removeSeq {
 368     my $self = shift;
 369     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 370     $self->remove_seq(@_);
 371 }
 372
 373 sub remove_seq {
 374     my $self = shift;
 375     my $seq = shift;
 376     my ($name,$id);
 377
 378     $self->throw("Need Bio::Locatable seq argument ")
 379         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 380
 381     $id = $seq->id();
 382     $name = $seq->get_nse;
 383
 384     if( !exists $self->{'_seq'}->{$name} ) {
 385         $self->throw("Sequence $name does not exist in the alignment to remove!");
 386     }
 387
 388     delete $self->{'_seq'}->{$name};
 389
 390     # we need to remove this seq from the start_end_lists hash
 391
 392     if (exists $self->{'_start_end_lists'}->{$id}) {
 393         # we need to find the sequence in the array.
 394
 395         my ($i, $found);;
 396         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 397             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 398                 $found = 1;
 399                 last;
 400             }
 401         }
 402         if ($found) {
 403             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 404         }
 405         else {
 406             $self->throw("Could not find the sequence to remoce from the start-end list");
 407         }
 408     }
 409     else {
 410         $self->throw("There is no seq list for the name $id");
 411     }
 412     # we need to shift order hash
 413     my %rev_order = reverse %{$self->{'_order'}};
 414     my $no = $rev_order{$name};
 415     my $num_sequences = $self->num_sequences;
 416     for (; $no < $num_sequences; $no++) {
 417        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 418     }
 419     delete $self->{'_order'}->{$no};
 420     return 1;
 421 }
 422
 423
 424 =head2 purge
 425
 426  Title   : purge
 427  Usage   : $aln->purge(0.7);
 428  Function: Removes sequences above given sequence similarity
 429            This function will grind on large alignments. Beware!
 430  Example :
 431  Returns : An array of the removed sequences
 432  Args    : float, threshold for similarity
 433
 434 =cut
 435
 436 sub purge {
 437         my ($self,$perc) = @_;
 438         my (%duplicate, @dups);
 439
 440         my @seqs = $self->each_seq();
 441
 442         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 443                 my $seq = $seqs[$i];
 444
 445                 #skip if already in duplicate hash
 446                 next if exists $duplicate{$seq->display_id} ;
 447                 my $one = $seq->seq();
 448
 449                 my @one = split '', $one;       #split to get 1aa per array element
 450
 451                 for (my $j=$i+1;$j < @seqs;$j++) {
 452                         my $seq2 = $seqs[$j];
 453
 454                         #skip if already in duplicate hash
 455                         next if exists $duplicate{$seq2->display_id} ;
 456
 457                         my $two = $seq2->seq();
 458                         my @two = split '', $two;
 459
 460                         my $count = 0;
 461                         my $res = 0;
 462                         for (my $k=0;$k<@one;$k++) {
 463                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 464                                           $one[$k] eq $two[$k]) {
 465                                         $count++;
 466                                 }
 467                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 468                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 469                                         $res++;
 470                                 }
 471                         }
 472
 473                         my $ratio = 0;
 474                         $ratio = $count/$res unless $res == 0;
 475
 476                         # if above threshold put in duplicate hash and push onto
 477                         # duplicate array for returning to get_unique
 478                         if ( $ratio > $perc ) {
 479                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 480                                 $duplicate{$seq2->display_id} = 1;
 481                                 push @dups, $seq2;
 482                         }
 483                 }
 484         }
 485         foreach my $seq (@dups) {
 486                 $self->remove_seq($seq);
 487         }
 488         return @dups;
 489 }
 490
 491 =head2 sort_alphabetically
 492
 493  Title     : sort_alphabetically
 494  Usage     : $ali->sort_alphabetically
 495  Function  : Changes the order of the alignment to alphabetical on name
 496              followed by numerical by number.
 497  Returns   :
 498  Argument  :
 499
 500 =cut
 501
 502 sub sort_alphabetically {
 503     my $self = shift;
 504     my ($seq,$nse,@arr,%hash,$count);
 505
 506     foreach $seq ( $self->each_seq() ) {
 507         $nse = $seq->get_nse;
 508         $hash{$nse} = $seq;
 509     }
 510
 511     $count = 0;
 512
 513     %{$self->{'_order'}} = (); # reset the hash;
 514
 515     foreach $nse ( sort _alpha_startend keys %hash) {
 516         $self->{'_order'}->{$count} = $nse;
 517
 518         $count++;
 519     }
 520     1;
 521 }
 522
 523 =head2 sort_by_list
 524
 525  Title     : sort_by_list
 526  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 527  Function  : Arbitrarily order sequences in an alignment
 528  Returns   : A new Bio::SimpleAlign object
 529  Argument  : a file listing sequence names in intended order (one name per line)
 530
 531 =cut
 532
 533 sub sort_by_list {
 534     my ($self, $list) = @_;
 535     my (@seq, @ids, %order);
 536
 537     foreach my $seq ( $self->each_seq() ) {
 538         push @seq, $seq;
 539         push @ids, $seq->display_id;
 540     }
 541
 542     my $ct=1;
 543     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 544     while (<$listfh>) {
 545       chomp;
 546       my $name=$_;
 547       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 548       $order{$name}=$ct++;
 549     }
 550     close($listfh);
 551
 552     # use the map-sort-map idiom:
 553     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 554     my $aln = $self->new;
 555     foreach (@sorted) { $aln->add_seq($_) }
 556     return $aln;
 557 }
 558
 559 =head2 set_new_reference
 560
 561  Title     : set_new_reference
 562  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 563              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 564              as the reference (1st) sequence
 565  Function  : Change/Set a new reference (i.e., the first) sequence
 566  Returns   : a new Bio::SimpleAlign object.
 567              Throws an exception if designated sequence not found
 568  Argument  : a positive integer of sequence order, or a sequence name
 569              in the original alignment
 570
 571 =cut
 572
 573 sub set_new_reference {
 574     my ($self, $seqid) = @_;
 575     my $aln = $self->new;
 576     my (@seq, @ids, @new_seq);
 577     my $is_num=0;
 578     foreach my $seq ( $self->each_seq() ) {
 579         push @seq, $seq;
 580         push @ids, $seq->display_id;
 581     }
 582
 583     if ($seqid =~ /^\d+$/) { # argument is seq position
 584         $is_num=1;
 585         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 586     } else { # argument is a seq name
 587         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 588     }
 589
 590     for (my $i=0; $i<=$#seq; $i++) {
 591         my $pos=$i+1;
 592         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 593             unshift @new_seq, $seq[$i];
 594         } else {
 595             push @new_seq, $seq[$i];
 596         }
 597     }
 598     foreach (@new_seq) { $aln->add_seq($_);  }
 599     return $aln;
 600 }
 601
 602 sub _in_aln {  # check if input name exists in the alignment
 603     my ($str, $ref) = @_;
 604     foreach (@$ref) {
 605         return 1 if $str eq $_;
 606     }
 607     return 0;
 608 }
 609
 610
 611 =head2 uniq_seq
 612
 613  Title     : uniq_seq
 614  Usage     : $aln->uniq_seq():  Remove identical sequences in
 615              in the alignment.  Ambiguous base ("N", "n") and
 616              leading and ending gaps ("-") are NOT counted as
 617              differences.
 618  Function  : Make a new alignment of unique sequence types (STs)
 619  Returns   : 1a. if called in a scalar context,
 620                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 621              1b. if called in an array context,
 622                 a new Bio::SimpleAlign object, and a hashref whose keys
 623                 are sequence types, and whose values are arrayrefs to
 624                 lists of sequence ids within the corresponding sequence type
 625              2. if $aln->verbose > 0, ST of each sequence is sent to
 626                 STDERR (in a tabular format)
 627  Argument  : None
 628
 629 =cut
 630
 631 sub uniq_seq {
 632     my ($self, $seqid) = @_;
 633     my $aln = $self->new;
 634     my (%member, %order, @seq, @uniq_str, $st);
 635     my $order=0;
 636     my $len = $self->length();
 637     $st = {};
 638     foreach my $seq ( $self->each_seq() ) {
 639         my $str = $seq->seq();
 640
 641 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 642 # comparing two sequence strings
 643
 644     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 645         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 646     # 2nd, convert leading and ending gaps to "?":
 647         $str = &_convert_leading_ending_gaps($str, '-', '?');
 648     # Note that '?' also can mean unknown residue.
 649     # I don't like making global class member changes like this, too
 650     # prone to errors... -- cjfields 08-11-18
 651     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 652         my $new = Bio::LocatableSeq->new(
 653                      -id      => $seq->id(),
 654                                          -alphabet=> $seq->alphabet,
 655                                          -seq     => $str,
 656                                          -start   => $seq->start,
 657                                          -end     => $seq->end
 658                                          );
 659         push @seq, $new;
 660     }
 661
 662     foreach my $seq (@seq) {
 663         my $str = $seq->seq();
 664         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 665         if ($seen) { # seen before
 666             my @memb = @{$member{$key}};
 667             push @memb, $seq;
 668             $member{$key} = \@memb;
 669         } else {  # not seen
 670             push @uniq_str, $key;
 671             $order++;
 672             $member{$key} = [ ($seq) ];
 673             $order{$key} = $order;
 674         }
 675     }
 676
 677     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 678 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 679         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 680 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 681         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 682         my $gap='-';
 683         my $end= CORE::length($str2);
 684         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 685         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 686                                          -seq  =>$str2,
 687                                          -start=>1,
 688                                          -end  =>$end
 689                                          );
 690         $aln->add_seq($new);
 691         foreach (@{$member{$str}}) {
 692             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 693         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 694         }
 695     }
 696     return wantarray ? ($aln, $st) : $aln;
 697 }
 698
 699 sub _check_uniq {  # check if same seq exists in the alignment
 700     my ($str1, $ref, $length) = @_;
 701     my @char1=split //, $str1;
 702     my @array=@$ref;
 703
 704     return (0, $str1) if @array==0; # not seen (1st sequence)
 705
 706     foreach my $str2 (@array) {
 707         my $diff=0;
 708         my @char2=split //, $str2;
 709         for (my $i=0; $i<=$length-1; $i++) {
 710             next if $char1[$i] eq '?';
 711             next if $char2[$i] eq '?';
 712             $diff++ if $char1[$i] ne $char2[$i];
 713         }
 714         return (1, $str2) if $diff == 0;  # seen before
 715     }
 716
 717     return (0, $str1); # not seen
 718 }
 719
 720 sub _convert_leading_ending_gaps {
 721     my $s=shift;
 722     my $sym1=shift;
 723     my $sym2=shift;
 724     my @array=split //, $s;
 725 # convert leading char:
 726     for (my $i=0; $i<=$#array; $i++) {
 727         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 728     }
 729 # convert ending char:
 730     for (my $i = $#array; $i>= 0; $i--) {
 731         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 732     }
 733     my $s_new=join '', @array;
 734     return $s_new;
 735 }
 736
 737 =head1 Sequence selection methods
 738
 739 Methods returning one or more sequences objects.
 740
 741 =head2 each_seq
 742
 743  Title     : each_seq
 744  Usage     : foreach $seq ( $align->each_seq() )
 745  Function  : Gets a Seq object from the alignment
 746  Returns   : Seq object
 747  Argument  :
 748
 749 =cut
 750
 751 sub eachSeq {
 752     my $self = shift;
 753     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 754     $self->each_seq();
 755 }
 756
 757 sub each_seq {
 758         my $self = shift;
 759         my (@arr,$order);
 760
 761         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 762                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 763                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 764                 }
 765         }
 766         return @arr;
 767 }
 768
 769
 770 =head2 each_alphabetically
 771
 772  Title     : each_alphabetically
 773  Usage     : foreach $seq ( $ali->each_alphabetically() )
 774  Function  : Returns a sequence object, but the objects are returned
 775              in alphabetically sorted order.
 776              Does not change the order of the alignment.
 777  Returns   : Seq object
 778  Argument  :
 779
 780 =cut
 781
 782 sub each_alphabetically {
 783         my $self = shift;
 784         my ($seq,$nse,@arr,%hash,$count);
 785
 786         foreach $seq ( $self->each_seq() ) {
 787                 $nse = $seq->get_nse;
 788                 $hash{$nse} = $seq;
 789         }
 790
 791         foreach $nse ( sort _alpha_startend keys %hash) {
 792                 push(@arr,$hash{$nse});
 793         }
 794         return @arr;
 795 }
 796
 797 sub _alpha_startend {
 798     my ($aname,$astart,$bname,$bstart);
 799     ($aname,$astart) = split (/-/,$a);
 800     ($bname,$bstart) = split (/-/,$b);
 801
 802     if( $aname eq $bname ) {
 803         return $astart <=> $bstart;
 804     }
 805     else {
 806         return $aname cmp $bname;
 807     }
 808 }
 809
 810 =head2 each_seq_with_id
 811
 812  Title     : each_seq_with_id
 813  Usage     : foreach $seq ( $align->each_seq_with_id() )
 814  Function  : Gets a Seq objects from the alignment, the contents
 815              being those sequences with the given name (there may be
 816              more than one)
 817  Returns   : Seq object
 818  Argument  : a seq name
 819
 820 =cut
 821
 822 sub eachSeqWithId {
 823     my $self = shift;
 824     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 825     $self->each_seq_with_id(@_);
 826 }
 827
 828 sub each_seq_with_id {
 829     my $self = shift;
 830     my $id = shift;
 831
 832     $self->throw("Method each_seq_with_id needs a sequence name argument")
 833         unless defined $id;
 834
 835     my (@arr, $seq);
 836
 837     if (exists($self->{'_start_end_lists'}->{$id})) {
 838         @arr = @{$self->{'_start_end_lists'}->{$id}};
 839     }
 840     return @arr;
 841 }
 842
 843 =head2 get_seq_by_pos
 844
 845  Title     : get_seq_by_pos
 846  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 847  Function  : Gets a sequence based on its position in the alignment.
 848              Numbering starts from 1.  Sequence positions larger than
 849              num_sequences() will throw an error.
 850  Returns   : a Bio::LocatableSeq object
 851  Args      : positive integer for the sequence position
 852
 853 =cut
 854
 855 sub get_seq_by_pos {
 856
 857     my $self = shift;
 858     my ($pos) = @_;
 859
 860     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 861         unless $pos =~ /^\d+$/ and $pos > 0;
 862     $self->throw("No sequence at position [$pos]")
 863         unless $pos <= $self->num_sequences ;
 864
 865     my $nse = $self->{'_order'}->{--$pos};
 866     return $self->{'_seq'}->{$nse};
 867 }
 868
 869 =head2 get_seq_by_id
 870
 871  Title     : get_seq_by_id
 872  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 873  Function  : Gets a sequence based on its name.
 874              Sequences that do not exist will warn and return undef
 875  Returns   : a Bio::LocatableSeq object
 876  Args      : string for sequence name
 877
 878 =cut
 879
 880 sub get_seq_by_id {
 881     my ($self,$name) = @_;
 882     unless( defined $name ) {
 883       $self->warn("Must provide a sequence name");
 884       return;
 885     }
 886     for my $seq ( values %{$self->{'_seq'}} ) {
 887       if ( $seq->id eq $name) {
 888         return $seq;
 889       }
 890     }
 891     return;
 892 }
 893
 894 =head2 seq_with_features
 895
 896  Title   : seq_with_features
 897  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 898                                           -consensus => 60
 899                                           -mask =>
 900            sub { my $consensus = shift;
 901
 902                  for my $i (1..5){
 903                     my $n = 'N' x $i;
 904                     my $q = '\?' x $i;
 905                     while($consensus =~ /[^?]$q[^?]/){
 906                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 907                     }
 908                   }
 909                  return $consensus;
 910                }
 911                                          );
 912  Function: produces a Bio::Seq object by first splicing gaps from -pos
 913            (by means of a splice_by_seq_pos() call), then creating
 914            features using non-? chars (by means of a consensus_string()
 915            call with stringency -consensus).
 916  Returns : a Bio::Seq object
 917  Args    : -pos : required. sequence from which to build the Bio::Seq
 918              object
 919            -consensus : optional, defaults to consensus_string()'s
 920              default cutoff value
 921            -mask : optional, a coderef to apply to consensus_string()'s
 922              output before building features.  this may be useful for
 923              closing gaps of 1 bp by masking over them with N, for
 924              instance
 925
 926 =cut
 927
 928 sub seq_with_features{
 929    my ($self,%arg) = @_;
 930
 931    #first do the preparatory splice
 932    $self->throw("must provide a -pos argument") unless $arg{-pos};
 933    $self->splice_by_seq_pos($arg{-pos});
 934
 935    my $consensus_string = $self->consensus_string($arg{-consensus});
 936    $consensus_string = $arg{-mask}->($consensus_string)
 937          if defined($arg{-mask});
 938
 939    my(@bs,@es);
 940
 941    push @bs, 1 if $consensus_string =~ /^[^?]/;
 942
 943    while($consensus_string =~ /\?[^?]/g){
 944          push @bs, pos($consensus_string);
 945    }
 946    while($consensus_string =~ /[^?]\?/g){
 947          push @es, pos($consensus_string);
 948    }
 949
 950    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 951
 952    my $seq = Bio::Seq->new();
 953
 954 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 955 #                -source_tag => 'location',
 956 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 957 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 958 #                                                  );
 959 #   $seq->add_SeqFeature($rootfeature);
 960
 961    while(my $b = shift @bs){
 962          my $e = shift @es;
 963          $seq->add_SeqFeature(
 964        Bio::SeqFeature::Generic->new(
 965          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 966          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 967          -source_tag => $self->source || 'MSA',
 968        )
 969      );
 970    }
 971
 972    return $seq;
 973 }
 974
 975
 976 =head1 Create new alignments
 977
 978 The result of these methods are horizontal or vertical subsets of the
 979 current MSA.
 980
 981 =head2 select
 982
 983  Title     : select
 984  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 985  Function  : Creates a new alignment from a continuous subset of
 986              sequences.  Numbering starts from 1.  Sequence positions
 987              larger than num_sequences() will throw an error.
 988  Returns   : a Bio::SimpleAlign object
 989  Args      : positive integer for the first sequence
 990              positive integer for the last sequence to include (optional)
 991
 992 =cut
 993
 994 sub select {
 995     my $self = shift;
 996     my ($start, $end) = @_;
 997
 998     $self->throw("Select start has to be a positive integer, not [$start]")
 999         unless $start =~ /^\d+$/ and $start > 0;
1000     $self->throw("Select end has to be a positive integer, not [$end]")
1001         unless $end  =~ /^\d+$/ and $end > 0;
1002     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1003         unless $start <= $end;
1004
1005     my $aln = $self->new;
1006     foreach my $pos ($start .. $end) {
1007         $aln->add_seq($self->get_seq_by_pos($pos));
1008     }
1009     $aln->id($self->id);
1010     # fix for meta, sf, ann
1011     return $aln;
1012 }
1013
1014 =head2 select_noncont
1015
1016  Title     : select_noncont
1017  Usage     : # 1st and 3rd sequences, sorted
1018              $aln2 = $aln->select_noncont(1, 3)
1019
1020              # 1st and 3rd sequences, sorted (same as first)
1021              $aln2 = $aln->select_noncont(3, 1)
1022
1023              # 1st and 3rd sequences, unsorted
1024              $aln2 = $aln->select_noncont('nosort',3, 1)
1025
1026  Function  : Creates a new alignment from a subset of sequences.  Numbering
1027              starts from 1.  Sequence positions larger than num_sequences() will
1028              throw an error.  Sorts the order added to new alignment by default,
1029              to prevent sorting pass 'nosort' as the first argument in the list.
1030  Returns   : a Bio::SimpleAlign object
1031  Args      : array of integers for the sequences.  If the string 'nosort' is
1032              passed as the first argument, the sequences will not be sorted
1033              in the new alignment but will appear in the order listed.
1034
1035 =cut
1036
1037 sub select_noncont {
1038         my $self = shift;
1039     my $nosort = 0;
1040         my (@pos) = @_;
1041     if ($pos[0] !~ m{^\d+$}) {
1042         my $sortcmd = shift @pos;
1043         if ($sortcmd eq 'nosort') {
1044             $nosort = 1;
1045         } else {
1046             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1047         }
1048     }
1049
1050     my $end = $self->num_sequences;
1051     foreach ( @pos ) {
1052                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1053                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1054         }
1055
1056         @pos = sort {$a <=> $b} @pos unless $nosort;
1057
1058         my $aln = $self->new;
1059         foreach my $p (@pos) {
1060                 $aln->add_seq($self->get_seq_by_pos($p));
1061         }
1062         $aln->id($self->id);
1063     # fix for meta, sf, ann
1064         return $aln;
1065 }
1066
1067 =head2 select_noncont_by_name
1068
1069  Title     : select_noncont_by_name
1070  Usage     : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1071  Function  : Creates a new alignment from a subset of sequences which are
1072              selected by name (sequence ID).
1073  Returns   : a Bio::SimpleAlign object
1074  Args      : array of names (i.e., identifiers) for the sequences.
1075
1076 =cut
1077
1078 sub select_noncont_by_name {
1079     my ($self, @names) = @_;
1080
1081     my $aln = $self->new;
1082     foreach my $name (@names) {
1083         $aln->add_seq($self->get_seq_by_id($name));
1084     }
1085     $aln->id($self->id);
1086
1087     return $aln;
1088 }
1089
1090 =head2 slice
1091
1092  Title     : slice
1093  Usage     : $aln2 = $aln->slice(20,30)
1094  Function  : Creates a slice from the alignment inclusive of start and
1095              end columns, and the first column in the alignment is denoted 1.
1096              Sequences with no residues in the slice are excluded from the
1097              new alignment and a warning is printed. Slice beyond the length of
1098              the sequence does not do padding.
1099  Returns   : A Bio::SimpleAlign object
1100  Args      : Positive integer for start column, positive integer for end column,
1101              optional boolean which if true will keep gap-only columns in the newly
1102              created slice. Example:
1103
1104              $aln2 = $aln->slice(20,30,1)
1105
1106 =cut
1107
1108 sub slice {
1109         my $self = shift;
1110         my ($start, $end, $keep_gap_only) = @_;
1111
1112         $self->throw("Slice start has to be a positive integer, not [$start]")
1113           unless $start =~ /^\d+$/ and $start > 0;
1114         $self->throw("Slice end has to be a positive integer, not [$end]")
1115           unless $end =~ /^\d+$/ and $end > 0;
1116         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1117           unless $start <= $end;
1118         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1119                                          "[$start] is too big.") if $start > $self->length;
1120     my $cons_meta = $self->consensus_meta;
1121         my $aln = $self->new;
1122         $aln->id($self->id);
1123         foreach my $seq ( $self->each_seq() ) {
1124             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1125             Bio::Seq::Meta->new
1126         (-id      => $seq->id,
1127                  -alphabet => $seq->alphabet,
1128                  -strand  => $seq->strand,
1129                  -verbose => $self->verbose) :
1130             Bio::LocatableSeq->new
1131         (-id      => $seq->id,
1132                  -alphabet => $seq->alphabet,
1133                  -strand  => $seq->strand,
1134                  -verbose => $self->verbose);
1135
1136             # seq
1137             my $seq_end = $end;
1138             $seq_end = $seq->length if( $end > $seq->length );
1139
1140             my $slice_seq = $seq->subseq($start, $seq_end);
1141             $new_seq->seq( $slice_seq );
1142
1143             $slice_seq =~ s/\W//g;
1144
1145             if ($start > 1) {
1146             my $pre_start_seq = $seq->subseq(1, $start - 1);
1147             $pre_start_seq =~ s/\W//g;
1148             if (!defined($seq->strand)) {
1149                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1150             } elsif ($seq->strand < 0){
1151                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1152             } else {
1153                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1154             }
1155             } else {
1156             if ((defined $seq->strand)&&($seq->strand < 0)){
1157                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1158             } else {
1159                $new_seq->start( $seq->start);
1160             }
1161             }
1162         if ($new_seq->isa('Bio::Seq::MetaI')) {
1163             for my $meta_name ($seq->meta_names) {
1164                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1165             }
1166         }
1167             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1168
1169             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1170             $aln->add_seq($new_seq);
1171             } else {
1172             if( $keep_gap_only ) {
1173                 $aln->add_seq($new_seq);
1174             } else {
1175                 my $nse = $seq->get_nse();
1176                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1177                     " Sequence excluded from the new alignment.");
1178             }
1179             }
1180         }
1181     if ($cons_meta) {
1182         my $new = Bio::Seq::Meta->new();
1183         for my $meta_name ($cons_meta->meta_names) {
1184             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1185         }
1186         $aln->consensus_meta($new);
1187     }
1188     $aln->annotation($self->annotation);
1189     # fix for meta, sf, ann
1190         return $aln;
1191 }
1192
1193 =head2 remove_columns
1194
1195  Title     : remove_columns
1196  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1197              $aln2 = $aln->remove_columns([0,0],[6,8])
1198  Function  : Creates an aligment with columns removed corresponding to
1199              the specified type or by specifying the columns by number.
1200  Returns   : Bio::SimpleAlign object
1201  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1202              'all_gaps_columns') or array ref where the referenced array
1203              contains a pair of integers that specify a range.
1204              The first column is 0
1205
1206 =cut
1207
1208 sub remove_columns {
1209     my ($self,@args) = @_;
1210     @args || $self->throw("Must supply column ranges or column types");
1211     my $aln;
1212
1213     if ($args[0][0] =~ /^[a-z_]+$/i) {
1214         $aln = $self->_remove_columns_by_type($args[0]);
1215     } elsif ($args[0][0] =~ /^\d+$/) {
1216         $aln = $self->_remove_columns_by_num(\@args);
1217     } else {
1218         $self->throw("You must pass array references to remove_columns(), not @args");
1219     }
1220     # fix for meta, sf, ann
1221     $aln;
1222 }
1223
1224
1225 =head2 remove_gaps
1226
1227  Title     : remove_gaps
1228  Usage     : $aln2 = $aln->remove_gaps
1229  Function  : Creates an aligment with gaps removed
1230  Returns   : a Bio::SimpleAlign object
1231  Args      : a gap character(optional) if none specified taken
1232                 from $self->gap_char,
1233              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1234                         indicates that only all-gaps columns should be deleted
1235
1236 Used from method L<remove_columns> in most cases. Set gap character
1237 using L<gap_char()|gap_char>.
1238
1239 =cut
1240
1241 sub remove_gaps {
1242     my ($self,$gapchar,$all_gaps_columns) = @_;
1243     my $gap_line;
1244     if ($all_gaps_columns) {
1245         $gap_line = $self->all_gap_line($gapchar);
1246     } else {
1247         $gap_line = $self->gap_line($gapchar);
1248     }
1249     my $aln = $self->new;
1250
1251     my @remove;
1252     my $length = 0;
1253     my $del_char = $gapchar || $self->gap_char;
1254     # Do the matching to get the segments to remove
1255     while ($gap_line =~ m/[$del_char]/g) {
1256         my $start = pos($gap_line)-1;
1257         $gap_line =~ m/\G[$del_char]+/gc;
1258         my $end = pos($gap_line)-1;
1259
1260         #have to offset the start and end for subsequent removes
1261         $start-=$length;
1262         $end  -=$length;
1263         $length += ($end-$start+1);
1264         push @remove, [$start,$end];
1265     }
1266
1267     #remove the segments
1268     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1269     # fix for meta, sf, ann
1270     return $aln;
1271 }
1272
1273
1274 sub _remove_col {
1275     my ($self,$aln,$remove) = @_;
1276     my @new;
1277
1278     my $gap = $self->gap_char;
1279
1280     # splice out the segments and create new seq
1281     foreach my $seq($self->each_seq){
1282         my $new_seq = Bio::LocatableSeq->new(
1283                                              -id      => $seq->id,
1284                                              -alphabet=> $seq->alphabet,
1285                                              -strand  => $seq->strand,
1286                                              -verbose => $self->verbose);
1287         my $sequence = $seq->seq;
1288         foreach my $pair(@{$remove}){
1289             my $start = $pair->[0];
1290             my $end   = $pair->[1];
1291             $sequence = $seq->seq unless $sequence;
1292             my $orig = $sequence;
1293             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1294             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1295             $sequence = $head.$tail;
1296             # start
1297             unless (defined $new_seq->start) {
1298                 if ($start == 0) {
1299                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1300                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1301                 }
1302                 else {
1303                     my $start_adjust = $orig =~ /^$gap+/;
1304                     if ($start_adjust) {
1305                         $start_adjust = $+[0] == $start;
1306                     }
1307                     $new_seq->start($seq->start + $start_adjust);
1308                 }
1309             }
1310             # end
1311             if (($end + 1) >= CORE::length($orig)) {
1312                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1313                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1314             }
1315             else {
1316                 $new_seq->end($seq->end);
1317             }
1318         }
1319
1320         if ($new_seq->end < $new_seq->start) {
1321             # we removed all columns except for gaps: set to 0 to indicate no
1322             # sequence
1323             $new_seq->start(0);
1324             $new_seq->end(0);
1325         }
1326
1327         $new_seq->seq($sequence) if $sequence;
1328                 push @new, $new_seq;
1329     }
1330     # add the new seqs to the alignment
1331     foreach my $new(@new){
1332         $aln->add_seq($new);
1333     }
1334     # fix for meta, sf, ann
1335     return $aln;
1336 }
1337
1338 sub _remove_columns_by_type {
1339         my ($self,$type) = @_;
1340         my $aln = $self->new;
1341         my @remove;
1342
1343         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1344         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1345         my %matchchars = ( 'match'           => '\*',
1346                        'weak'             => '\.',
1347                        'strong'           => ':',
1348                        'mismatch'         => ' ',
1349                        'gaps'             => '',
1350                        'all_gaps_columns' => ''
1351                      );
1352         # get the characters to delete against
1353         my $del_char;
1354         foreach my $type (@{$type}){
1355                 $del_char.= $matchchars{$type};
1356         }
1357
1358         my $length = 0;
1359         my $match_line = $self->match_line;
1360         # do the matching to get the segments to remove
1361         if($del_char){
1362                 while($match_line =~ m/[$del_char]/g ){
1363                         my $start = pos($match_line)-1;
1364                         $match_line=~/\G[$del_char]+/gc;
1365                         my $end = pos($match_line)-1;
1366
1367                         #have to offset the start and end for subsequent removes
1368                         $start-=$length;
1369                         $end  -=$length;
1370                         $length += ($end-$start+1);
1371                         push @remove, [$start,$end];
1372                 }
1373         }
1374
1375         # remove the segments
1376         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1377         $aln = $aln->remove_gaps() if $gap;
1378         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1379     # fix for meta, sf, ann
1380         $aln;
1381 }
1382
1383
1384 sub _remove_columns_by_num {
1385         my ($self,$positions) = @_;
1386         my $aln = $self->new;
1387
1388         # sort the positions
1389         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1390
1391     my @remove;
1392     my $length = 0;
1393     foreach my $pos (@{$positions}) {
1394         my ($start, $end) = @{$pos};
1395
1396         #have to offset the start and end for subsequent removes
1397         $start-=$length;
1398         $end  -=$length;
1399         $length += ($end-$start+1);
1400         push @remove, [$start,$end];
1401     }
1402
1403     #remove the segments
1404     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1405     # fix for meta, sf, ann
1406         $aln;
1407 }
1408
1409
1410 =head1 Change sequences within the MSA
1411
1412 These methods affect characters in all sequences without changing the
1413 alignment.
1414
1415 =head2 splice_by_seq_pos
1416
1417  Title   : splice_by_seq_pos
1418  Usage   : $status = splice_by_seq_pos(1);
1419  Function: splices all aligned sequences where the specified sequence
1420            has gaps.
1421  Example :
1422  Returns : 1 on success
1423  Args    : position of sequence to splice by
1424
1425
1426 =cut
1427
1428 sub splice_by_seq_pos{
1429   my ($self,$pos) = @_;
1430
1431   my $guide = $self->get_seq_by_pos($pos);
1432   my $guide_seq = $guide->seq;
1433
1434   $guide_seq =~ s/\./\-/g;
1435
1436   my @gaps = ();
1437   $pos = -1;
1438   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1439     unshift @gaps, $pos;
1440     $pos++;
1441   }
1442
1443   foreach my $seq ($self->each_seq){
1444     my @bases = split '', $seq->seq;
1445
1446     splice(@bases, $_, 1) foreach @gaps;
1447     $seq->seq(join('', @bases));
1448   }
1449
1450   1;
1451 }
1452
1453 =head2 map_chars
1454
1455  Title     : map_chars
1456  Usage     : $ali->map_chars('\.','-')
1457  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1458              characters.
1459
1460              Note that the first argument is interpreted as a regexp
1461              so be careful and escape any wild card characters (e.g.
1462              do $ali->map_chars('\.','-') to replace periods with dashes.
1463  Returns   : 1 on success
1464  Argument  : A regexp and a string
1465
1466 =cut
1467
1468 sub map_chars {
1469     my $self = shift;
1470     my $from = shift;
1471     my $to   = shift;
1472     my ( $seq, $temp );
1473
1474     $self->throw("Need two arguments: a regexp and a string")
1475       unless defined $from and defined $to;
1476
1477     foreach $seq ( $self->each_seq() ) {
1478         $temp = $seq->seq();
1479         $temp =~ s/$from/$to/g;
1480         $seq->seq($temp);
1481     }
1482     return 1;
1483 }
1484
1485
1486 =head2 uppercase
1487
1488  Title     : uppercase()
1489  Usage     : $ali->uppercase()
1490  Function  : Sets all the sequences to uppercase
1491  Returns   : 1 on success
1492  Argument  :
1493
1494 =cut
1495
1496 sub uppercase {
1497     my $self = shift;
1498     my $seq;
1499     my $temp;
1500
1501     foreach $seq ( $self->each_seq() ) {
1502       $temp = $seq->seq();
1503       $temp =~ tr/[a-z]/[A-Z]/;
1504
1505       $seq->seq($temp);
1506     }
1507     return 1;
1508 }
1509
1510 =head2 cigar_line
1511
1512  Title    : cigar_line()
1513  Usage    : %cigars = $align->cigar_line()
1514  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1515             Report) line for each sequence in the alignment. Examples are
1516             "1,60" or "5,10:12,58", where the numbers refer to conserved
1517             positions within the alignment. The keys of the hash are the
1518             NSEs (name/start/end) assigned to each sequence.
1519  Args     : threshold (optional, defaults to 100)
1520  Returns  : Hash of strings (cigar lines)
1521
1522 =cut
1523
1524 sub cigar_line {
1525         my $self = shift;
1526         my $thr=shift||100;
1527         my %cigars;
1528
1529         my @consensus = split "",($self->consensus_string($thr));
1530         my $len = $self->length;
1531         my $gapchar = $self->gap_char;
1532
1533         # create a precursor, something like (1,4,5,6,7,33,45),
1534         # where each number corresponds to a conserved position
1535         foreach my $seq ( $self->each_seq ) {
1536                 my @seq = split "", uc ($seq->seq);
1537                 my $pos = 1;
1538                 for (my $x = 0 ; $x < $len ; $x++ ) {
1539                         if ($seq[$x] eq $consensus[$x]) {
1540                                 push @{$cigars{$seq->get_nse}},$pos;
1541                                 $pos++;
1542                         } elsif ($seq[$x] ne $gapchar) {
1543                                 $pos++;
1544                         }
1545                 }
1546         }
1547         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1548         for my $name (keys %cigars) {
1549                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1550                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1551       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1552            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1553                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1554                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1555                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1556                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1557                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1558                         }
1559       }
1560         }
1561   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1562   for my $name (keys %cigars) {
1563           my @remove;
1564           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1565                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1566                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1567                       unshift @remove,$x;
1568               }
1569            }
1570       for my $pos (@remove) {
1571                         splice @{$cigars{$name}}, $pos, 1;
1572            }
1573    }
1574    # join and punctuate
1575    for my $name (keys %cigars) {
1576           my ($start,$end,$str) = "";
1577           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1578                   $str .= ($start . "," . $end . ":");
1579           }
1580           $str =~ s/:$//;
1581       $cigars{$name} = $str;
1582    }
1583    %cigars;
1584 }
1585
1586
1587 =head2 match_line
1588
1589  Title    : match_line()
1590  Usage    : $line = $align->match_line()
1591  Function : Generates a match line - much like consensus string
1592             except that a line indicating the '*' for a match.
1593  Args     : (optional) Match line characters ('*' by default)
1594             (optional) Strong match char (':' by default)
1595             (optional) Weak match char ('.' by default)
1596  Returns  : String
1597
1598 =cut
1599
1600 sub match_line {
1601         my ($self,$matchlinechar, $strong, $weak) = @_;
1602         my %matchchars = ('match'    => $matchlinechar || '*',
1603                                                           'weak'     => $weak          || '.',
1604                                                           'strong'   => $strong        || ':',
1605                                                           'mismatch' => ' ',
1606                                                   );
1607
1608         my @seqchars;
1609         my $alphabet;
1610         foreach my $seq ( $self->each_seq ) {
1611                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1612                 $alphabet = $seq->alphabet unless defined $alphabet;
1613         }
1614         my $refseq = shift @seqchars;
1615         # let's just march down the columns
1616         my $matchline;
1617  POS:
1618         foreach my $pos ( 0..$self->length ) {
1619                 my $refchar = $refseq->[$pos];
1620                 my $char = $matchchars{'mismatch'};
1621                 unless( defined $refchar ) {
1622                         last if $pos == $self->length; # short circuit on last residue
1623                         # this in place to handle jason's soon-to-be-committed
1624                         # intron mapping code
1625                         goto bottom;
1626                 }
1627                 my %col = ($refchar => 1);
1628                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1629                 foreach my $seq ( @seqchars ) {
1630                         next if $pos >= scalar @$seq;
1631                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1632                                                           $seq->[$pos] eq ' ' );
1633                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1634                 }
1635                 my @colresidues = sort keys %col;
1636
1637                 # if all the values are the same
1638                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1639                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1640                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1641                         # matches for protein seqs
1642             TYPE:
1643                         foreach my $type ( qw(strong weak) ) {
1644                                 # iterate through categories
1645                                 my %groups;
1646                                 # iterate through each of the aa in the col
1647                                 # look to see which groups it is in
1648                                 foreach my $c ( @colresidues ) {
1649                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1650                                                 push @{$groups{$f}},$c;
1651                                         }
1652                                 }
1653                          GRP:
1654                                 foreach my $cols ( values %groups ) {
1655                                         @$cols = sort @$cols;
1656                                         # now we are just testing to see if two arrays
1657                                         # are identical w/o changing either one
1658                                         # have to be same len
1659                                         next if( scalar @$cols != scalar @colresidues );
1660                                         # walk down the length and check each slot
1661                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1662                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1663                                         }
1664                                         $char = $matchchars{$type};
1665                                         last TYPE;
1666                                 }
1667                         }
1668                 }
1669          bottom:
1670                 $matchline .= $char;
1671         }
1672         return $matchline;
1673 }
1674
1675
1676 =head2 gap_line
1677
1678  Title    : gap_line()
1679  Usage    : $line = $align->gap_line()
1680  Function : Generates a gap line - much like consensus string
1681             except that a line where '-' represents gap
1682  Args     : (optional) gap line characters ('-' by default)
1683  Returns  : string
1684
1685 =cut
1686
1687 sub gap_line {
1688     my ($self,$gapchar) = @_;
1689     $gapchar = $gapchar || $self->gap_char;
1690     my %gap_hsh; # column gaps vector
1691     foreach my $seq ( $self->each_seq ) {
1692                 my $i = 0;
1693         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1694                   map {[$i++, $_]} split(//, uc ($seq->seq));
1695     }
1696     my $gap_line;
1697     foreach my $pos ( 0..$self->length-1 ) {
1698           $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1699     }
1700     return $gap_line;
1701 }
1702
1703 =head2 all_gap_line
1704
1705  Title    : all_gap_line()
1706  Usage    : $line = $align->all_gap_line()
1707  Function : Generates a gap line - much like consensus string
1708             except that a line where '-' represents all-gap column
1709  Args     : (optional) gap line characters ('-' by default)
1710  Returns  : string
1711
1712 =cut
1713
1714 sub all_gap_line {
1715     my ($self,$gapchar) = @_;
1716     $gapchar = $gapchar || $self->gap_char;
1717     my %gap_hsh;                # column gaps counter hash
1718     my @seqs = $self->each_seq;
1719     foreach my $seq ( @seqs ) {
1720         my $i = 0;
1721         map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1722         map {[$i++, $_]} split(//, uc ($seq->seq));
1723     }
1724     my $gap_line;
1725     foreach my $pos ( 0..$self->length-1 ) {
1726         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1727             # gaps column
1728             $gap_line .= $self->gap_char;
1729         } else {
1730             $gap_line .= '.';
1731         }
1732     }
1733     return $gap_line;
1734 }
1735
1736 =head2 gap_col_matrix
1737
1738  Title    : gap_col_matrix()
1739  Usage    : my $cols = $align->gap_col_matrix()
1740  Function : Generates an array where each element in the array is a
1741             hash reference with a key of the sequence name and a
1742             value of 1 if the sequence has a gap at that column
1743  Returns  : Reference to an array
1744  Args     : Optional: gap line character ($aln->gap_char or '-' by default)
1745
1746 =cut
1747
1748 sub gap_col_matrix {
1749     my ( $self, $gapchar ) = @_;
1750     $gapchar = $gapchar || $self->gap_char;
1751     my %gap_hsh;    # column gaps vector
1752     my @cols;
1753     foreach my $seq ( $self->each_seq ) {
1754         my $i   = 0;
1755         my $str = $seq->seq;
1756         my $len = $seq->length;
1757         my $ch;
1758         my $id = $seq->display_id;
1759         while ( $i < $len ) {
1760             $ch = substr( $str, $i, 1 );
1761             $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1762         }
1763     }
1764     return \@cols;
1765 }
1766
1767 =head2 match
1768
1769  Title     : match()
1770  Usage     : $ali->match()
1771  Function  : Goes through all columns and changes residues that are
1772              identical to residue in first sequence to match '.'
1773              character. Sets match_char.
1774
1775              USE WITH CARE: Most MSA formats do not support match
1776              characters in sequences, so this is mostly for output
1777              only. NEXUS format (Bio::AlignIO::nexus) can handle
1778              it.
1779  Returns   : 1 on success
1780  Argument  : a match character, optional, defaults to '.'
1781
1782 =cut
1783
1784 sub match {
1785     my ( $self, $match ) = @_;
1786
1787     $match ||= '.';
1788     my ($matching_char) = $match;
1789     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/;    #';
1790     $self->map_chars( $matching_char, '-' );
1791
1792     my @seqs = $self->each_seq();
1793     return 1 unless scalar @seqs > 1;
1794
1795     my $refseq  = shift @seqs;
1796     my @refseq  = split //, $refseq->seq;
1797     my $gapchar = $self->gap_char;
1798
1799     foreach my $seq (@seqs) {
1800         my @varseq = split //, $seq->seq();
1801         for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1802             $varseq[$i] = $match
1803                 if defined $refseq[$i]
1804                 && ( $refseq[$i] =~ /[A-Za-z\*]/
1805                 || $refseq[$i] =~ /$gapchar/ )
1806                 && $refseq[$i] eq $varseq[$i];
1807         }
1808         $seq->seq( join '', @varseq );
1809     }
1810     $self->match_char($match);
1811     return 1;
1812 }
1813
1814
1815
1816 =head2 unmatch
1817
1818  Title     : unmatch()
1819  Usage     : $ali->unmatch()
1820  Function  : Undoes the effect of method match. Unsets match_char.
1821  Returns   : 1 on success
1822  Argument  : a match character, optional, defaults to '.'
1823
1824 See L<match> and L<match_char>
1825
1826 =cut
1827
1828 sub unmatch {
1829     my ( $self, $match ) = @_;
1830
1831     $match ||= '.';
1832
1833     my @seqs = $self->each_seq();
1834     return 1 unless scalar @seqs > 1;
1835
1836     my $refseq  = shift @seqs;
1837     my @refseq  = split //, $refseq->seq;
1838     my $gapchar = $self->gap_char;
1839     foreach my $seq (@seqs) {
1840         my @varseq = split //, $seq->seq();
1841         for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1842             $varseq[$i] = $refseq[$i]
1843                 if defined $refseq[$i]
1844                 && ( $refseq[$i] =~ /[A-Za-z\*]/
1845                 || $refseq[$i] =~ /$gapchar/ )
1846                 && $varseq[$i] eq $match;
1847         }
1848         $seq->seq( join '', @varseq );
1849     }
1850     $self->match_char('');
1851     return 1;
1852 }
1853
1854
1855 =head1 MSA attributes
1856
1857 Methods for setting and reading the MSA attributes.
1858
1859 Note that the methods defining character semantics depend on the user
1860 to set them sensibly.  They are needed only by certain input/output
1861 methods. Unset them by setting to an empty string ('').
1862
1863 =head2 id
1864
1865  Title     : id
1866  Usage     : $myalign->id("Ig")
1867  Function  : Gets/sets the id field of the alignment
1868  Returns   : An id string
1869  Argument  : An id string (optional)
1870
1871 =cut
1872
1873 sub id {
1874     my ( $self, $name ) = @_;
1875
1876     if ( defined($name) ) {
1877         $self->{'_id'} = $name;
1878     }
1879
1880     return $self->{'_id'};
1881 }
1882
1883 =head2 accession
1884
1885  Title     : accession
1886  Usage     : $myalign->accession("PF00244")
1887  Function  : Gets/sets the accession field of the alignment
1888  Returns   : An acc string
1889  Argument  : An acc string (optional)
1890
1891 =cut
1892
1893 sub accession {
1894     my ( $self, $acc ) = @_;
1895
1896     if ( defined($acc) ) {
1897         $self->{'_accession'} = $acc;
1898     }
1899
1900     return $self->{'_accession'};
1901 }
1902
1903 =head2 description
1904
1905  Title     : description
1906  Usage     : $myalign->description("14-3-3 proteins")
1907  Function  : Gets/sets the description field of the alignment
1908  Returns   : An description string
1909  Argument  : An description string (optional)
1910
1911 =cut
1912
1913 sub description {
1914     my ( $self, $name ) = @_;
1915
1916     if ( defined($name) ) {
1917         $self->{'_description'} = $name;
1918     }
1919
1920     return $self->{'_description'};
1921 }
1922
1923 =head2 missing_char
1924
1925  Title     : missing_char
1926  Usage     : $myalign->missing_char("?")
1927  Function  : Gets/sets the missing_char attribute of the alignment
1928              It is generally recommended to set it to 'n' or 'N'
1929              for nucleotides and to 'X' for protein.
1930  Returns   : An missing_char string,
1931  Argument  : An missing_char string (optional)
1932
1933 =cut
1934
1935 sub missing_char {
1936     my ( $self, $char ) = @_;
1937
1938     if ( defined $char ) {
1939         $self->throw("Single missing character, not [$char]!")
1940             if CORE::length($char) > 1;
1941         $self->{'_missing_char'} = $char;
1942     }
1943
1944     return $self->{'_missing_char'};
1945 }
1946
1947
1948 =head2 match_char
1949
1950  Title     : match_char
1951  Usage     : $myalign->match_char('.')
1952  Function  : Gets/sets the match_char attribute of the alignment
1953  Returns   : An match_char string,
1954  Argument  : An match_char string (optional)
1955
1956 =cut
1957
1958 sub match_char {
1959     my ( $self, $char ) = @_;
1960
1961     if ( defined $char ) {
1962         $self->throw("Single match character, not [$char]!")
1963             if CORE::length($char) > 1;
1964         $self->{'_match_char'} = $char;
1965     }
1966
1967     return $self->{'_match_char'};
1968 }
1969
1970 =head2 gap_char
1971
1972  Title     : gap_char
1973  Usage     : $myalign->gap_char('-')
1974  Function  : Gets/sets the gap_char attribute of the alignment
1975  Returns   : An gap_char string, defaults to '-'
1976  Argument  : An gap_char string (optional)
1977
1978 =cut
1979
1980 sub gap_char {
1981     my ( $self, $char ) = @_;
1982
1983     if ( defined $char || !defined $self->{'_gap_char'} ) {
1984         $char = '-' unless defined $char;
1985         $self->throw("Single gap character, not [$char]!")
1986             if CORE::length($char) > 1;
1987         $self->{'_gap_char'} = $char;
1988     }
1989     return $self->{'_gap_char'};
1990 }
1991
1992
1993 =head2 symbol_chars
1994
1995  Title   : symbol_chars
1996  Usage   : my @symbolchars = $aln->symbol_chars;
1997  Function: Returns all the seen symbols (other than gaps)
1998  Returns : array of characters that are the seen symbols
1999  Args    : boolean to include the gap/missing/match characters
2000
2001 =cut
2002
2003 sub symbol_chars{
2004    my ($self,$includeextra) = @_;
2005
2006    unless ($self->{'_symbols'}) {
2007        foreach my $seq ($self->each_seq) {
2008            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
2009        }
2010    }
2011    my %copy = %{$self->{'_symbols'}};
2012    if( ! $includeextra ) {
2013        foreach my $char ( $self->gap_char, $self->match_char,
2014                           $self->missing_char) {
2015            delete $copy{$char} if( defined $char );
2016        }
2017    }
2018    return keys %copy;
2019 }
2020
2021 =head1 Alignment descriptors
2022
2023 These read only methods describe the MSA in various ways.
2024
2025
2026 =head2 score
2027
2028  Title     : score
2029  Usage     : $str = $ali->score()
2030  Function  : get/set a score of the alignment
2031  Returns   : a score for the alignment
2032  Argument  : an optional score to set
2033
2034 =cut
2035
2036 sub score {
2037   my $self = shift;
2038   $self->{score} = shift if @_;
2039   return $self->{score};
2040 }
2041
2042 =head2 consensus_string
2043
2044  Title     : consensus_string
2045  Usage     : $str = $ali->consensus_string($threshold_percent)
2046  Function  : Makes a strict consensus
2047  Returns   : Consensus string
2048  Argument  : Optional threshold ranging from 0 to 100.
2049              The consensus residue has to appear at least threshold %
2050              of the sequences at a given location, otherwise a '?'
2051              character will be placed at that location.
2052              (Default value = 0%)
2053
2054 =cut
2055
2056 sub consensus_string {
2057     my $self      = shift;
2058     my $threshold = shift;
2059
2060     my $out = "";
2061     my $len = $self->length - 1;
2062
2063     foreach ( 0 .. $len ) {
2064         $out .= $self->_consensus_aa( $_, $threshold );
2065     }
2066     return $out;
2067 }
2068
2069
2070 =head2 consensus_conservation
2071
2072  Title     : consensus_conservation
2073  Usage     : @conservation = $ali->consensus_conservation();
2074  Function  : Conservation (as a percent) of each position of alignment
2075  Returns   : Array of percentages [0-100]. Gap columns are 0% conserved.
2076  Argument  :
2077
2078 =cut
2079
2080 sub consensus_conservation {
2081     my $self = shift;
2082     my @cons;
2083     my $num_sequences = $self->num_sequences;
2084     foreach my $point (0..$self->length-1) {
2085         my %hash = $self->_consensus_counts($point);
2086         # max frequency of a non-gap letter
2087         my $max = (sort {$b<=>$a} values %hash )[0];
2088         push @cons, 100 * $max / $num_sequences;
2089     }
2090     return @cons;
2091 }
2092
2093 sub _consensus_aa {
2094     my $self = shift;
2095     my $point = shift;
2096     my $threshold_percent = shift || -1 ;
2097     my ($seq,%hash,$count,$letter,$key);
2098     my $gapchar = $self->gap_char;
2099     %hash = $self->_consensus_counts($point);
2100     my $number_of_sequences = $self->num_sequences();
2101     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2102     $count = -1;
2103     $letter = '?';
2104
2105     foreach $key ( sort keys %hash ) {
2106         # print "Now at $key $hash{$key}\n";
2107         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2108             $letter = $key;
2109             $count = $hash{$key};
2110         }
2111     }
2112     return $letter;
2113 }
2114
2115 # Frequency of each letter in one column
2116 sub _consensus_counts {
2117     my $self = shift;
2118     my $point = shift;
2119     my %hash;
2120     my $gapchar = $self->gap_char;
2121     foreach my $seq ( $self->each_seq() ) {
2122         my $letter = substr($seq->seq,$point,1);
2123         $self->throw("--$point-----------") if $letter eq '';
2124         ($letter eq $gapchar || $letter =~ /\./) && next;
2125         $hash{$letter}++;
2126     }
2127     return %hash;
2128 }
2129
2130
2131 =head2 consensus_iupac
2132
2133  Title     : consensus_iupac
2134  Usage     : $str = $ali->consensus_iupac()
2135  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2136              and RNA. The output is in upper case except when gaps in
2137              a column force output to be in lower case.
2138
2139              Note that if your alignment sequences contain a lot of
2140              IUPAC ambiquity codes you often have to manually set
2141              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2142              indicate a protein sequence.
2143  Returns   : consensus string
2144  Argument  : none
2145  Throws    : on protein sequences
2146
2147 =cut
2148
2149 sub consensus_iupac {
2150     my $self = shift;
2151     my $out  = "";
2152     my $len  = $self->length - 1;
2153
2154     # only DNA and RNA sequences are valid
2155     foreach my $seq ( $self->each_seq() ) {
2156         $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2157             if $seq->alphabet eq 'protein';
2158     }
2159
2160     # loop over the alignment columns
2161     foreach my $count ( 0 .. $len ) {
2162         $out .= $self->_consensus_iupac($count);
2163     }
2164     return $out;
2165 }
2166
2167 sub _consensus_iupac {
2168     my ($self, $column) = @_;
2169     my ($string, $char, $rna);
2170
2171     #determine all residues in a column
2172     foreach my $seq ( $self->each_seq() ) {
2173         $string .= substr($seq->seq, $column, 1);
2174     }
2175     $string = uc $string;
2176
2177     # quick exit if there's an N in the string
2178     if ($string =~ /N/) {
2179         $string =~ /\W/ ? return 'n' : return 'N';
2180     }
2181     # ... or if there are only gap characters
2182     return '-' if $string =~ /^\W+$/;
2183
2184     # treat RNA as DNA in regexps
2185     if ($string =~ /U/) {
2186         $string =~ s/U/T/;
2187         $rna = 1;
2188     }
2189
2190     # the following s///'s only need to be done to the _first_ ambiguity code
2191     # as we only need to see the _range_ of characters in $string
2192
2193     if ($string =~ /[VDHB]/) {
2194         $string =~ s/V/AGC/;
2195         $string =~ s/D/AGT/;
2196         $string =~ s/H/ACT/;
2197         $string =~ s/B/CTG/;
2198     }
2199
2200     if ($string =~ /[SKYRWM]/) {
2201         $string =~ s/S/GC/;
2202         $string =~ s/K/GT/;
2203         $string =~ s/Y/CT/;
2204         $string =~ s/R/AG/;
2205         $string =~ s/W/AT/;
2206         $string =~ s/M/AC/;
2207     }
2208
2209     # and now the guts of the thing
2210
2211     if ($string =~ /A/) {
2212         $char = 'A';                     # A                      A
2213         if ($string =~ /G/) {
2214             $char = 'R';                 # A and G (purines)      R
2215             if ($string =~ /C/) {
2216                 $char = 'V';             # A and G and C          V
2217                 if ($string =~ /T/) {
2218                     $char = 'N';         # A and G and C and T    N
2219                 }
2220             } elsif ($string =~ /T/) {
2221                 $char = 'D';             # A and G and T          D
2222             }
2223         } elsif ($string =~ /C/) {
2224             $char = 'M';                 # A and C                M
2225             if ($string =~ /T/) {
2226                 $char = 'H';             # A and C and T          H
2227             }
2228         } elsif ($string =~ /T/) {
2229             $char = 'W';                 # A and T                W
2230         }
2231     } elsif ($string =~ /C/) {
2232         $char = 'C';                     # C                      C
2233         if ($string =~ /T/) {
2234             $char = 'Y';                 # C and T (pyrimidines)  Y
2235             if ($string =~ /G/) {
2236                 $char = 'B';             # C and T and G          B
2237             }
2238         } elsif ($string =~ /G/) {
2239             $char = 'S';                 # C and G                S
2240         }
2241     } elsif ($string =~ /G/) {
2242         $char = 'G';                     # G                      G
2243         if ($string =~ /C/) {
2244             $char = 'S';                 # G and C                S
2245         } elsif ($string =~ /T/) {
2246             $char = 'K';                 # G and T                K
2247         }
2248     } elsif ($string =~ /T/) {
2249         $char = 'T';                     # T                      T
2250     }
2251
2252     $char = 'U' if $rna and $char eq 'T';
2253     $char = lc $char if $string =~ /\W/;
2254
2255     return $char;
2256 }
2257
2258
2259 =head2 consensus_meta
2260
2261  Title     : consensus_meta
2262  Usage     : $seqmeta = $ali->consensus_meta()
2263  Function  : Returns a Bio::Seq::Meta object containing the consensus
2264              strings derived from meta data analysis.
2265  Returns   : Bio::Seq::Meta
2266  Argument  : Bio::Seq::Meta
2267  Throws    : non-MetaI object
2268
2269 =cut
2270
2271 sub consensus_meta {
2272     my ($self, $meta) = @_;
2273     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2274         $self->throw('Not a Bio::Seq::MetaI object');
2275     }
2276     return $self->{'_aln_meta'} = $meta if $meta;
2277     return $self->{'_aln_meta'}
2278 }
2279
2280 =head2 is_flush
2281
2282  Title     : is_flush
2283  Usage     : if ( $ali->is_flush() )
2284  Function  : Tells you whether the alignment
2285            : is flush, i.e. all of the same length
2286  Returns   : 1 or 0
2287  Argument  :
2288
2289 =cut
2290
2291 sub is_flush {
2292     my ( $self, $report ) = @_;
2293     my $seq;
2294     my $length = (-1);
2295     my $temp;
2296
2297     foreach $seq ( $self->each_seq() ) {
2298         if ( $length == (-1) ) {
2299             $length = CORE::length( $seq->seq() );
2300             next;
2301         }
2302
2303         $temp = CORE::length( $seq->seq() );
2304         if ( $temp != $length ) {
2305             $self->warn(
2306                 "expecting $length not $temp from " . $seq->display_id )
2307                 if ($report);
2308             $self->debug(
2309                 "expecting $length not $temp from " . $seq->display_id );
2310             $self->debug( $seq->seq() . "\n" );
2311             return 0;
2312         }
2313     }
2314
2315     return 1;
2316 }
2317
2318
2319 =head2 length
2320
2321  Title     : length()
2322  Usage     : $len = $ali->length()
2323  Function  : Returns the maximum length of the alignment.
2324              To be sure the alignment is a block, use is_flush
2325  Returns   : Integer
2326  Argument  :
2327
2328 =cut
2329
2330 sub length_aln {
2331     my $self = shift;
2332     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2333     $self->length(@_);
2334 }
2335
2336 sub length {
2337     my $self = shift;
2338     my $seq;
2339     my $length = -1;
2340     my $temp;
2341
2342     foreach $seq ( $self->each_seq() ) {
2343         $temp = $seq->length();
2344         if( $temp > $length ) {
2345             $length = $temp;
2346         }
2347     }
2348
2349     return $length;
2350 }
2351
2352
2353 =head2 maxdisplayname_length
2354
2355  Title     : maxdisplayname_length
2356  Usage     : $ali->maxdisplayname_length()
2357  Function  : Gets the maximum length of the displayname in the
2358              alignment. Used in writing out various MSA formats.
2359  Returns   : integer
2360  Argument  :
2361
2362 =cut
2363
2364 sub maxname_length {
2365     my $self = shift;
2366     $self->deprecated("maxname_length - deprecated method.".
2367                       " Use maxdisplayname_length() instead.");
2368     $self->maxdisplayname_length();
2369 }
2370
2371 sub maxnse_length {
2372     my $self = shift;
2373     $self->deprecated("maxnse_length - deprecated method.".
2374                       " Use maxnse_length() instead.");
2375     $self->maxdisplayname_length();
2376 }
2377
2378 sub maxdisplayname_length {
2379     my $self    = shift;
2380     my $maxname = (-1);
2381     my ( $seq, $len );
2382
2383     foreach $seq ( $self->each_seq() ) {
2384         $len = CORE::length $self->displayname( $seq->get_nse() );
2385
2386         if ( $len > $maxname ) {
2387             $maxname = $len;
2388         }
2389     }
2390
2391     return $maxname;
2392 }
2393
2394 =head2 max_metaname_length
2395
2396  Title     : max_metaname_length
2397  Usage     : $ali->max_metaname_length()
2398  Function  : Gets the maximum length of the meta name tags in the
2399              alignment for the sequences and for the alignment.
2400              Used in writing out various MSA formats.
2401  Returns   : integer
2402  Argument  : None
2403
2404 =cut
2405
2406 sub max_metaname_length {
2407     my $self = shift;
2408     my $maxname = (-1);
2409     my ($seq,$len);
2410
2411     # check seq meta first
2412     for $seq ( $self->each_seq() ) {
2413         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2414         for my $mtag ($seq->meta_names) {
2415             $len = CORE::length $mtag;
2416             if( $len > $maxname ) {
2417                 $maxname = $len;
2418             }
2419         }
2420     }
2421
2422     # alignment meta
2423     for my $meta ($self->consensus_meta) {
2424         next unless $meta;
2425         for my $name ($meta->meta_names) {
2426             $len = CORE::length $name;
2427             if( $len > $maxname ) {
2428                 $maxname = $len;
2429             }
2430         }
2431     }
2432
2433     return $maxname;
2434 }
2435
2436 =head2 num_residues
2437
2438  Title     : num_residues
2439  Usage     : $no = $ali->num_residues
2440  Function  : number of residues in total in the alignment
2441  Returns   : integer
2442  Argument  :
2443  Note      : replaces no_residues()
2444
2445 =cut
2446
2447 sub num_residues {
2448     my $self  = shift;
2449     my $count = 0;
2450
2451     foreach my $seq ( $self->each_seq ) {
2452         my $str = $seq->seq();
2453
2454         $count += ( $str =~ s/[A-Za-z]//g );
2455     }
2456
2457     return $count;
2458 }
2459
2460 =head2 num_sequences
2461
2462  Title     : num_sequences
2463  Usage     : $depth = $ali->num_sequences
2464  Function  : number of sequence in the sequence alignment
2465  Returns   : integer
2466  Argument  : none
2467  Note      : replaces no_sequences()
2468
2469 =cut
2470
2471 sub num_sequences {
2472     my $self = shift;
2473     return scalar($self->each_seq);
2474 }
2475
2476 =head2 average_percentage_identity
2477
2478  Title   : average_percentage_identity
2479  Usage   : $id = $align->average_percentage_identity
2480  Function: The function uses a fast method to calculate the average
2481            percentage identity of the alignment
2482  Returns : The average percentage identity of the alignment
2483  Args    : None
2484  Notes   : This method implemented by Kevin Howe calculates a figure that is
2485            designed to be similar to the average pairwise identity of the
2486            alignment (identical in the absence of gaps), without having to
2487            explicitly calculate pairwise identities proposed by Richard Durbin.
2488            Validated by Ewan Birney ad Alex Bateman.
2489
2490 =cut
2491
2492 sub average_percentage_identity{
2493    my ($self,@args) = @_;
2494
2495    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2496                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2497
2498    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2499
2500    if (! $self->is_flush()) {
2501        $self->throw("All sequences in the alignment must be the same length");
2502    }
2503
2504    @seqs = $self->each_seq();
2505    $len = $self->length();
2506
2507    # load the each hash with correct keys for existence checks
2508
2509    for( my $index=0; $index < $len; $index++) {
2510        foreach my $letter (@alphabet) {
2511            $countHashes[$index]->{$letter} = 0;
2512        }
2513    }
2514    foreach my $seq (@seqs)  {
2515        my @seqChars = split //, $seq->seq();
2516        for( my $column=0; $column < @seqChars; $column++ ) {
2517            my $char = uc($seqChars[$column]);
2518            if (exists $countHashes[$column]->{$char}) {
2519                $countHashes[$column]->{$char}++;
2520            }
2521        }
2522    }
2523
2524    $total = 0;
2525    $divisor = 0;
2526    for(my $column =0; $column < $len; $column++) {
2527        my %hash = %{$countHashes[$column]};
2528        $subdivisor = 0;
2529        foreach my $res (keys %hash) {
2530            $total += $hash{$res}*($hash{$res} - 1);
2531            $subdivisor += $hash{$res};
2532        }
2533        $divisor += $subdivisor * ($subdivisor - 1);
2534    }
2535    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2536 }
2537
2538 =head2 percentage_identity
2539
2540  Title   : percentage_identity
2541  Usage   : $id = $align->percentage_identity
2542  Function: The function calculates the average percentage identity
2543            (aliased to average_percentage_identity)
2544  Returns : The average percentage identity
2545  Args    : None
2546
2547 =cut
2548
2549 sub percentage_identity {
2550     my $self = shift;
2551     return $self->average_percentage_identity();
2552 }
2553
2554 =head2 overall_percentage_identity
2555
2556  Title   : overall_percentage_identity
2557  Usage   : $id = $align->overall_percentage_identity
2558            $id = $align->overall_percentage_identity('short')
2559  Function: The function calculates the percentage identity of
2560            the conserved columns
2561  Returns : The percentage identity of the conserved columns
2562  Args    : length value to use, optional defaults to alignment length
2563                  possible values: 'align', 'short', 'long'
2564
2565 The argument values 'short' and 'long' refer to shortest and longest
2566 sequence in the alignment. Method modification code by Hongyu Zhang.
2567
2568 =cut
2569
2570 sub overall_percentage_identity{
2571    my ($self, $length_measure) = @_;
2572
2573    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2574
2575    my %enum = map {$_ => undef} qw (align short long);
2576
2577    $self->throw("Unknown argument [$length_measure]")
2578        if $length_measure and not exists $enum{$length_measure};
2579    $length_measure ||= 'align';
2580
2581    if (! $self->is_flush()) {
2582        $self->throw("All sequences in the alignment must be the same length");
2583    }
2584
2585    # Count the residues seen at each position
2586    my $len;
2587    my $total = 0; # number of positions with identical residues
2588    my @countHashes;
2589    my @seqs = $self->each_seq;
2590    my $nof_seqs = scalar @seqs;
2591    my $aln_len = $self->length();
2592    for my $seq (@seqs)  {
2593        my $seqstr = $seq->seq;
2594
2595        # Count residues for given sequence
2596        for my $column (0 .. $aln_len-1) {
2597            my $char = uc( substr($seqstr, $column, 1) );
2598            if ( exists $alphabet{$char} ) {
2599
2600                # This is a valid char
2601                if ( defined $countHashes[$column]->{$char} ) {
2602                  $countHashes[$column]->{$char}++;
2603                } else {
2604                  $countHashes[$column]->{$char} = 1;
2605                }
2606
2607                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2608                    # All sequences have this same residue
2609                    $total++;
2610                }
2611
2612            }
2613        }
2614
2615        # Sequence length
2616        if ($length_measure eq 'short' || $length_measure eq 'long') {
2617            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2618            if ($length_measure eq 'short') {
2619                if ( (not defined $len) || ($seq_len < $len) ) {
2620                    $len = $seq_len;
2621                }
2622            } elsif ($length_measure eq 'long') {
2623                if ( (not defined $len) || ($seq_len > $len) ) {
2624                    $len = $seq_len;
2625                }
2626            }
2627        }
2628
2629    }
2630
2631    if ($length_measure eq 'align') {
2632        $len = $aln_len;
2633    }
2634
2635    return ($total / $len ) * 100.0;
2636 }
2637
2638
2639
2640 =head1 Alignment positions
2641
2642 Methods to map a sequence position into an alignment column and back.
2643 column_from_residue_number() does the former. The latter is really a
2644 property of the sequence object and can done using
2645 L<Bio::LocatableSeq::location_from_column>:
2646
2647     # select somehow a sequence from the alignment, e.g.
2648     my $seq = $aln->get_seq_by_pos(1);
2649     #$loc is undef or Bio::LocationI object
2650     my $loc = $seq->location_from_column(5);
2651
2652 =head2 column_from_residue_number
2653
2654  Title   : column_from_residue_number
2655  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2656  Function: This function gives the position in the alignment
2657            (i.e. column number) of the given residue number in the
2658            sequence with the given name. For example, for the
2659            alignment
2660
2661              Seq1/91-97 AC..DEF.GH.
2662              Seq2/24-30 ACGG.RTY...
2663                 Seq3/43-51 AC.DDEF.GHI
2664
2665            column_from_residue_number( "Seq1", 94 ) returns 6.
2666            column_from_residue_number( "Seq2", 25 ) returns 2.
2667            column_from_residue_number( "Seq3", 50 ) returns 10.
2668
2669            An exception is thrown if the residue number would lie
2670            outside the length of the aligment
2671            (e.g. column_from_residue_number( "Seq2", 22 )
2672
2673           Note: If the the parent sequence is represented by more than
2674                 one alignment sequence and the residue number is present in
2675                 them, this method finds only the first one.
2676
2677  Returns : A column number for the position in the alignment of the
2678            given residue in the given sequence (1 = first column)
2679  Args    : A sequence id/name (not a name/start-end)
2680            A residue number in the whole sequence (not just that
2681            segment of it in the alignment)
2682
2683 =cut
2684
2685 sub column_from_residue_number {
2686     my ( $self, $name, $resnumber ) = @_;
2687
2688     $self->throw("No sequence with name [$name]")
2689         unless $self->{'_start_end_lists'}->{$name};
2690     $self->throw("Second argument residue number missing") unless $resnumber;
2691
2692     foreach my $seq ( $self->each_seq_with_id($name) ) {
2693         my $col;
2694         eval { $col = $seq->column_from_residue_number($resnumber); };
2695         next if $@;
2696         return $col;
2697     }
2698
2699     $self->throw( "Could not find a sequence segment in $name "
2700             . "containing residue number $resnumber" );
2701
2702 }
2703
2704 =head1 Sequence names
2705
2706 Methods to manipulate the display name. The default name based on the
2707 sequence id and subsequence positions can be overridden in various
2708 ways.
2709
2710 =head2 displayname
2711
2712  Title     : displayname
2713  Usage     : $myalign->displayname("Ig", "IgA")
2714  Function  : Gets/sets the display name of a sequence in the alignment
2715  Returns   : A display name string
2716  Argument  : name of the sequence
2717              displayname of the sequence (optional)
2718
2719 =cut
2720
2721 sub displayname {
2722     my ( $self, $name, $disname ) = @_;
2723
2724     $self->throw("No sequence with name [$name]")
2725         unless defined $self->{'_seq'}->{$name};
2726
2727     if ( $disname and $name ) {
2728         $self->{'_dis_name'}->{$name} = $disname;
2729         return $disname;
2730     }
2731     elsif ( defined $self->{'_dis_name'}->{$name} ) {
2732         return $self->{'_dis_name'}->{$name};
2733     }
2734     else {
2735         return $name;
2736     }
2737 }
2738
2739 sub get_displayname {
2740     my $self = shift;
2741     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2742     $self->displayname(@_);
2743 }
2744
2745 sub set_displayname {
2746     my $self = shift;
2747     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2748     $self->displayname(@_);
2749 }
2750
2751
2752 =head2 set_displayname_count
2753
2754  Title     : set_displayname_count
2755  Usage     : $ali->set_displayname_count
2756  Function  : Sets the names to be name_# where # is the number of
2757              times this name has been used.
2758  Returns   : 1, on success
2759  Argument  :
2760
2761 =cut
2762
2763 sub set_displayname_count {
2764     my $self= shift;
2765     my (@arr,$name,$seq,$count,$temp,$nse);
2766
2767     foreach $seq ( $self->each_alphabetically() ) {
2768         $nse = $seq->get_nse();
2769
2770         #name will be set when this is the second
2771         #time (or greater) is has been seen
2772
2773         if( defined $name and $name eq ($seq->id()) ) {
2774             $temp = sprintf("%s_%s",$name,$count);
2775             $self->displayname($nse,$temp);
2776             $count++;
2777         } else {
2778             $count = 1;
2779             $name = $seq->id();
2780             $temp = sprintf("%s_%s",$name,$count);
2781             $self->displayname($nse,$temp);
2782             $count++;
2783         }
2784     }
2785     return 1;
2786 }
2787
2788 =head2 set_displayname_flat
2789
2790  Title     : set_displayname_flat
2791  Usage     : $ali->set_displayname_flat()
2792  Function  : Makes all the sequences be displayed as just their name,
2793              not name/start-end (NSE)
2794  Returns   : 1
2795  Argument  :
2796
2797 =cut
2798
2799 sub set_displayname_flat {
2800     my $self = shift;
2801     my ( $nse, $seq );
2802
2803     foreach $seq ( $self->each_seq() ) {
2804         $nse = $seq->get_nse();
2805         $self->displayname( $nse, $seq->id() );
2806     }
2807     return 1;
2808 }
2809
2810
2811 =head2 set_displayname_normal
2812
2813  Title     : set_displayname_normal
2814  Usage     : $ali->set_displayname_normal()
2815  Function  : Makes all the sequences be displayed as name/start-end (NSE)
2816  Returns   : 1, on success
2817  Argument  :
2818
2819 =cut
2820
2821 sub set_displayname_normal {
2822     my $self = shift;
2823     my ( $nse, $seq );
2824
2825     foreach $seq ( $self->each_seq() ) {
2826         $nse = $seq->get_nse();
2827         $self->displayname( $nse, $nse );
2828     }
2829     return 1;
2830 }
2831
2832 =head2 source
2833
2834  Title   : source
2835  Usage   : $obj->source($newval)
2836  Function: sets the Alignment source program
2837  Example :
2838  Returns : value of source
2839  Args    : newvalue (optional)
2840
2841 =cut
2842
2843 sub source {
2844     my ( $self, $value ) = @_;
2845     if ( defined $value ) {
2846         $self->{'_source'} = $value;
2847     }
2848     return $self->{'_source'};
2849 }
2850
2851
2852 =head2 set_displayname_safe
2853
2854  Title     : set_displayname_safe
2855  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2856  Function  : Assign machine-generated serial names to sequences in input order.
2857              Designed to protect names during PHYLIP runs. Assign 10-char string
2858              in the form of "S000000001" to "S999999999". Restore the original
2859              names using "restore_displayname".
2860  Returns   : 1. a new $aln with system names;
2861              2. a hash ref for restoring names
2862  Argument  : Number for id length (default 10)
2863
2864 =cut
2865
2866 sub set_displayname_safe {
2867     my $self = shift;
2868     my $idlength = shift || 10;
2869     my ( $seq, %phylip_name );
2870     my $ct  = 0;
2871     my $new = Bio::SimpleAlign->new();
2872     foreach $seq ( $self->each_seq() ) {
2873         $ct++;
2874         my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2875         $phylip_name{$pname} = $seq->id();
2876         my $new_seq = Bio::LocatableSeq->new(
2877             -id       => $pname,
2878             -seq      => $seq->seq(),
2879             -alphabet => $seq->alphabet,
2880             -start    => $seq->{_start},
2881             -end      => $seq->{_end}
2882         );
2883         $new->add_seq($new_seq);
2884     }
2885
2886     $self->debug(
2887         "$ct seq names changed. Restore names by using restore_displayname.");
2888     return ( $new, \%phylip_name );
2889 }
2890
2891
2892 =head2 restore_displayname
2893
2894  Title     : restore_displayname
2895  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2896  Function  : Restore original sequence names (after running
2897              $ali->set_displayname_safe)
2898  Returns   : a new $aln with names restored.
2899  Argument  : a hash reference of names from "set_displayname_safe".
2900
2901 =cut
2902
2903 sub restore_displayname {
2904     my $self = shift;
2905     my $ref=shift;
2906     my %name=%$ref;
2907     my $new=Bio::SimpleAlign->new();
2908     foreach my $seq ( $self->each_seq() ) {
2909       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2910       my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2911                                           -seq      => $seq->seq(),
2912                                           -alphabet => $seq->alphabet,
2913                                           -start    => $seq->{_start},
2914                                           -end      => $seq->{_end}
2915                                           );
2916       $new->add_seq($new_seq);
2917     }
2918     return $new;
2919 }
2920
2921 =head2 sort_by_start
2922
2923  Title     : sort_by_start
2924  Usage     : $ali->sort_by_start
2925  Function  : Changes the order of the alignment to the start position of each
2926              subalignment
2927  Returns   : 1 on success
2928  Argument  :
2929
2930 =cut
2931
2932 sub sort_by_start {
2933     my $self = shift;
2934     my ($seq,$nse,@arr,%hash,$count);
2935     foreach $seq ( $self->each_seq() ) {
2936         $nse = $seq->get_nse;
2937         $hash{$nse} = $seq;
2938     }
2939     $count = 0;
2940     %{$self->{'_order'}} = (); # reset the hash;
2941     foreach $nse ( sort _startend keys %hash) {
2942         $self->{'_order'}->{$count} = $nse;
2943         $count++;
2944     }
2945     1;
2946 }
2947
2948 sub _startend {
2949     my ($aname,$arange) = split (/[\/]/,$a);
2950     my ($bname,$brange) = split (/[\/]/,$b);
2951     my ($astart,$aend) = split(/\-/,$arange);
2952     my ($bstart,$bend) = split(/\-/,$brange);
2953     return $astart <=> $bstart;
2954 }
2955
2956 =head2 bracket_string
2957
2958  Title     : bracket_string
2959  Usage     : my @params = (-refseq     => 'testseq',
2960                            -allele1    => 'allele1',
2961                            -allele2    => 'allele2',
2962                            -delimiters => '{}',
2963                            -separator  => '/');
2964              $str = $aln->bracket_string(@params)
2965
2966  Function :  When supplied with a list of parameters (see below), returns a
2967              string in BIC format. This is used for allelic comparisons.
2968              Briefly, if either allele contains a base change when compared to
2969              the refseq, the base or gap for each allele is represented in
2970              brackets in the order present in the 'alleles' parameter.
2971
2972              For the following data:
2973
2974              >testseq
2975              GGATCCATTGCTACT
2976              >allele1
2977              GGATCCATTCCTACT
2978              >allele2
2979              GGAT--ATTCCTCCT
2980
2981              the returned string with parameters 'refseq => testseq' and
2982              'alleles => [qw(allele1 allele2)]' would be:
2983
2984              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2985  Returns   : BIC-formatted string
2986  Argument  : Required args
2987                 refseq    : string (ID) of the reference sequence used
2988                             as basis for comparison
2989                 allele1   : string (ID) of the first allele
2990                 allele2   : string (ID) of the second allele
2991              Optional args
2992                 delimiters: two symbol string of left and right delimiters.
2993                             Only the first two symbols are used
2994                             default = '[]'
2995                 separator : string used as a separator.  Only the first
2996                             symbol is used
2997                             default = '/'
2998  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2999
3000 =cut
3001
3002 sub bracket_string {
3003     my ($self, @args) = @_;
3004     my ($ref, $a1, $a2, $delim, $sep) =
3005         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
3006     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
3007     my ($ld, $rd);
3008     ($ld, $rd) = split('', $delim, 2) if $delim;
3009     $ld ||= '[';
3010     $rd ||= ']';
3011     $sep ||= '/';
3012     my ($refseq, $allele1, $allele2) =
3013         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
3014     if (!$refseq || !$allele1 || !$allele2) {
3015         $self->throw("One of your refseq/allele IDs is invalid!");
3016     }
3017     my $len = $self->length-1;
3018     my $bic = '';
3019     # loop over the alignment columns
3020     for my $column ( 0 .. $len ) {
3021         my $string;
3022         my ($compres, $res1, $res2) =
3023             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3024         # are any of the allele symbols different from the refseq?
3025         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3026                 $ld.$res1.$sep.$res2.$rd;
3027         $bic .= $string;
3028     }
3029     return $bic;
3030 }
3031
3032
3033 =head2 methods implementing Bio::FeatureHolderI
3034
3035 FeatureHolderI implementation to support labeled character sets like one
3036 would get from NEXUS represented data.
3037
3038 =head2 get_SeqFeatures
3039
3040  Usage   : @features = $aln->get_SeqFeatures
3041  Function: Get the feature objects held by this feature holder.
3042  Example :
3043  Returns : an array of Bio::SeqFeatureI implementing objects
3044  Args    : optional filter coderef, taking a Bio::SeqFeatureI
3045          : as argument, returning TRUE if wanted, FALSE if
3046          : unwanted
3047
3048 =cut
3049
3050 sub get_SeqFeatures {
3051     my $self      = shift;
3052     my $filter_cb = shift;
3053     $self->throw("Arg (filter callback) must be a coderef")
3054         unless !defined($filter_cb)
3055         or ref($filter_cb) eq 'CODE';
3056     if ( !defined $self->{'_as_feat'} ) {
3057         $self->{'_as_feat'} = [];
3058     }
3059     if ($filter_cb) {
3060         return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
3061     }
3062     return @{ $self->{'_as_feat'} };
3063 }
3064
3065
3066 =head2 add_SeqFeature
3067
3068  Usage   : $aln->add_SeqFeature($subfeat);
3069  Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3070            (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3071  Example :
3072  Returns : 1 on success
3073  Args    : a Bio::SeqFeatureI object
3074
3075 =cut
3076
3077 sub add_SeqFeature {
3078    my ($self, @feat) = @_;
3079
3080    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3081
3082    if (scalar @feat > 1) {
3083       $self->deprecated(
3084          -message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3085                      ' is deprecated and will be removed in a future version. '.
3086                      'Add a single feature at a time instead.',
3087          -warn_version    => 1.007,
3088          -throw_version   => 1.009,
3089       );
3090    }
3091
3092    for my $feat ( @feat ) {
3093
3094        next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3095
3096        if( !$feat->isa("Bio::SeqFeatureI") ) {
3097            $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3098        }
3099
3100        push @{$self->{'_as_feat'}}, $feat;
3101    }
3102    return 1;
3103 }
3104
3105
3106 =head2 remove_SeqFeatures
3107
3108  Usage   : $obj->remove_SeqFeatures
3109  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3110            remove that subset from the returned array, and add back the rest.
3111  Returns : The array of Bio::SeqFeatureI features that was
3112            deleted from this alignment.
3113  Args    : none
3114
3115 =cut
3116
3117 sub remove_SeqFeatures {
3118     my $self = shift;
3119
3120     return () unless $self->{'_as_feat'};
3121     my @feats = @{$self->{'_as_feat'}};
3122     $self->{'_as_feat'} = [];
3123     return @feats;
3124 }
3125
3126 =head2 feature_count
3127
3128  Title   : feature_count
3129  Usage   : $obj->feature_count()
3130  Function: Return the number of SeqFeatures attached to the alignment
3131  Returns : integer representing the number of SeqFeatures
3132  Args    : None
3133
3134 =cut
3135
3136 sub feature_count {
3137     my ($self) = @_;
3138
3139     if (defined($self->{'_as_feat'})) {
3140         return ($#{$self->{'_as_feat'}} + 1);
3141     } else {
3142         return 0;
3143     }
3144 }
3145
3146 =head2 get_all_SeqFeatures
3147
3148  Title   : get_all_SeqFeatures
3149  Usage   :
3150  Function: Get all SeqFeatures.
3151  Example :
3152  Returns : an array of Bio::SeqFeatureI implementing objects
3153  Args    : none
3154  Note    : Falls through to Bio::FeatureHolderI implementation.
3155
3156 =cut
3157
3158 =head2 methods for Bio::AnnotatableI
3159
3160 AnnotatableI implementation to support sequence alignments which
3161 contain annotation (NEXUS, Stockholm).
3162
3163 =head2 annotation
3164
3165  Title   : annotation
3166  Usage   : $ann = $aln->annotation or
3167            $aln->annotation($ann)
3168  Function: Gets or sets the annotation
3169  Returns : Bio::AnnotationCollectionI object
3170  Args    : None or Bio::AnnotationCollectionI object
3171
3172 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3173 for more information
3174
3175 =cut
3176
3177 sub annotation {
3178     my ($obj,$value) = @_;
3179     if( defined $value ) {
3180         $obj->throw("object of class ".ref($value)." does not implement ".
3181                 "Bio::AnnotationCollectionI. Too bad.")
3182             unless $value->isa("Bio::AnnotationCollectionI");
3183         $obj->{'_annotation'} = $value;
3184     } elsif( ! defined $obj->{'_annotation'}) {
3185         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3186     }
3187     return $obj->{'_annotation'};
3188 }
3189
3190 =head1 Deprecated methods
3191
3192 =cut
3193
3194 =head2 no_residues
3195
3196  Title     : no_residues
3197  Usage     : $no = $ali->no_residues
3198  Function  : number of residues in total in the alignment
3199  Returns   : integer
3200  Argument  :
3201  Note      : deprecated in favor of num_residues()
3202
3203 =cut
3204
3205 sub no_residues {
3206         my $self = shift;
3207         $self->deprecated(-warn_version => 1.0069,
3208                                           -throw_version => 1.0075,
3209                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3210   $self->num_residues(@_);
3211 }
3212
3213 =head2 no_sequences
3214
3215  Title     : no_sequences
3216  Usage     : $depth = $ali->no_sequences
3217  Function  : number of sequence in the sequence alignment
3218  Returns   : integer
3219  Argument  :
3220  Note      : deprecated in favor of num_sequences()
3221
3222 =cut
3223
3224 sub no_sequences {
3225         my $self = shift;
3226         $self->deprecated(-warn_version => 1.0069,
3227                                           -throw_version => 1.0075,
3228                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3229     $self->num_sequences(@_);
3230 }
3231
3232 =head2 mask_columns
3233
3234  Title     : mask_columns
3235  Usage     : $aln2 = $aln->mask_columns(20,30)
3236  Function  : Masks a slice of the alignment inclusive of start and
3237              end columns, and the first column in the alignment is denoted 1.
3238              Mask beyond the length of the sequence does not do padding.
3239  Returns   : A Bio::SimpleAlign object
3240  Args      : Positive integer for start column, positive integer for end column,
3241              optional string value use for the mask. Example:
3242
3243              $aln2 = $aln->mask_columns(20,30,'?')
3244  Note      : Masking must use a character that is not used for gaps or
3245              frameshifts.  These can be adjusted using the relevant global
3246              variables, but be aware these may be (uncontrollably) modified
3247              elsewhere within BioPerl (see bug 2715)
3248
3249 =cut
3250
3251 sub mask_columns {
3252     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3253     my $self = shift;
3254
3255     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3256              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3257
3258     # coordinates are alignment-based, not sequence-based
3259     my ($start, $end, $mask_char) = @_;
3260     unless (defined $mask_char) { $mask_char = 'N' }
3261
3262     $self->throw("Mask start has to be a positive integer and less than ".
3263                  "alignment length, not [$start]")
3264       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3265     $self->throw("Mask end has to be a positive integer and less than ".
3266                  "alignment length, not [$end]")
3267       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3268     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3269                  "end [$end]") unless $start <= $end;
3270     $self->throw("Mask character $mask_char has to be a single character ".
3271                  "and not a gap or frameshift symbol")
3272       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3273
3274     my $aln = $self->new;
3275     $aln->id($self->id);
3276     foreach my $seq ( $self->each_seq() ) {
3277         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3278          -alphabet => $seq->alphabet,
3279          -strand  => $seq->strand,
3280          -verbose => $self->verbose);
3281
3282         # convert from 1-based alignment coords!
3283         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3284         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3285         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3286         $new_seq->seq($new_dna_string);
3287         $aln->add_seq($new_seq);
3288     }
3289     return $aln;
3290 }
3291
3292 1;