Bio/SimpleAlign.pm

   1 package Bio::SimpleAlign;
   2 use strict;
   3 use warnings;
   4 use Bio::LocatableSeq;  # uses Seq's as list
   5 use Bio::Seq;
   6 use Bio::SeqFeature::Generic;
   7
   8 use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
   9
  10 # BioPerl module for SimpleAlign
  11 #
  12 # Please direct questions and support issues to <bioperl-l@bioperl.org>
  13 #
  14 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
  15 #
  16 # Copyright Ewan Birney
  17 #
  18 # You may distribute this module under the same terms as perl itself
  19
  20 # POD documentation - main docs before the code
  21 #
  22 #  History:
  23 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  24 #       May 2001 major rewrite - Heikki Lehvaslaiho
  25
  26 =head1 NAME
  27
  28 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  29
  30 =head1 SYNOPSIS
  31
  32   # Use Bio::AlignIO to read in the alignment
  33   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  34   $aln = $str->next_aln();
  35
  36   # Describe
  37   print $aln->length;
  38   print $aln->num_residues;
  39   print $aln->is_flush;
  40   print $aln->num_sequences;
  41   print $aln->score;
  42   print $aln->percentage_identity;
  43   print $aln->consensus_string(50);
  44
  45   # Find the position in the alignment for a sequence location
  46   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  47
  48   # Extract sequences and check values for the alignment column $pos
  49   foreach $seq ($aln->each_seq) {
  50       $res = $seq->subseq($pos, $pos);
  51       $count{$res}++;
  52   }
  53   foreach $res (keys %count) {
  54       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  55   }
  56
  57   # Manipulate
  58   $aln->remove_seq($seq);
  59   $mini_aln = $aln->slice(20,30);  # get a block of columns
  60   $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
  61   $new_aln = $aln->remove_columns([20,30]); # remove by position
  62   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  63
  64   # Analyze
  65   $str = $aln->consensus_string($threshold_percent);
  66   $str = $aln->match_line();
  67   $str = $aln->cigar_line();
  68   $id = $aln->percentage_identity;
  69
  70   # See the module documentation for details and more methods.
  71
  72 =head1 DESCRIPTION
  73
  74 SimpleAlign is an object that handles a multiple sequence alignment
  75 (MSA). It is very permissive of types (it does not insist on sequences
  76 being all same length, for example). Think of it as a set of sequences
  77 with a whole series of built-in manipulations and methods for reading and
  78 writing alignments.
  79
  80 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  81 to store its sequences. These are subsequences with a start and end
  82 positions in the parent reference sequence. Each sequence in the
  83 SimpleAlign object is a Bio::LocatableSeq.
  84
  85 SimpleAlign expects the combination of name, start, and end for a
  86 given sequence to be unique in the alignment, and this is the key for the
  87 internal hashes (name, start, end are abbreviated C<nse> in the code).
  88 However, in some cases people do not want the name/start-end to be displayed:
  89 either multiple names in an alignment or names specific to the alignment
  90 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  91 C<displayname>, and generally is what is used to print out the
  92 alignment. They default to name/start-end.
  93
  94 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  95
  96 =head1 FEEDBACK
  97
  98 =head2 Mailing Lists
  99
 100 User feedback is an integral part of the evolution of this and other
 101 Bioperl modules.  Send your comments and suggestions preferably to one
 102 of the Bioperl mailing lists.  Your participation is much appreciated.
 103
 104   bioperl-l@bioperl.org                  - General discussion
 105   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 106
 107 =head2 Support
 108
 109 Please direct usage questions or support issues to the mailing list:
 110
 111 I<bioperl-l@bioperl.org>
 112
 113 rather than to the module maintainer directly. Many experienced and
 114 reponsive experts will be able look at the problem and quickly
 115 address it. Please include a thorough description of the problem
 116 with code and data examples if at all possible.
 117
 118 =head2 Reporting Bugs
 119
 120 Report bugs to the Bioperl bug tracking system to help us keep track
 121 the bugs and their resolution. Bug reports can be submitted via the
 122 web:
 123
 124   https://github.com/bioperl/bioperl-live/issues
 125
 126 =head1 AUTHOR
 127
 128 Ewan Birney, birney@ebi.ac.uk
 129
 130 =head1 CONTRIBUTORS
 131
 132 Allen Day, allenday-at-ucla.edu,
 133 Richard Adams, Richard.Adams-at-ed.ac.uk,
 134 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 135 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 136 Allen Smith, allens-at-cpan.org,
 137 Jason Stajich, jason-at-bioperl.org,
 138 Anthony Underwood, aunderwood-at-phls.org.uk,
 139 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 140 Brian Osborne, bosborne at alum.mit.edu
 141 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 142 Hongyu Zhang, forward at hongyu.org
 143 Jay Hannah, jay at jays.net
 144 Alexandr Bezginov, albezg at gmail.com
 145
 146 =head1 SEE ALSO
 147
 148 L<Bio::LocatableSeq>
 149
 150 =head1 APPENDIX
 151
 152 The rest of the documentation details each of the object
 153 methods. Internal methods are usually preceded with a _
 154
 155 =cut
 156
 157 ## This data should probably be in a more centralized module...
 158 ## it is taken from Clustalw documentation.
 159 ## These are all the positively scoring groups that occur in the
 160 ## Gonnet Pam250 matrix. The strong and weak groups are
 161 ## defined as strong score >0.5 and weak score =<0.5 respectively.
 162 our %CONSERVATION_GROUPS = (
 163   'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
 164   'weak'   => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
 165 );
 166
 167
 168 =head2 new
 169
 170  Title     : new
 171  Usage     : my $aln = Bio::SimpleAlign->new();
 172  Function  : Creates a new simple align object
 173  Returns   : Bio::SimpleAlign
 174  Args      : -source     => string representing the source program
 175                             where this alignment came from
 176              -annotation => Bio::AnnotationCollectionI
 177              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 178              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 179              -consensus  => consensus string
 180              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 181
 182 =cut
 183
 184
 185 sub new {
 186   my($class,@args) = @_;
 187
 188   my $self = $class->SUPER::new(@args);
 189
 190   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 191                                             SOURCE
 192                                             SCORE
 193                                             ID
 194                                             ACCESSION
 195                                             DESCRIPTION
 196                                             SEQS
 197                                             FEATURES
 198                                             ANNOTATION
 199                                             SEQ_ANNOTATION
 200                                             CONSENSUS
 201                                             CONSENSUS_META
 202                                             )], @args);
 203   $src && $self->source($src);
 204   defined $score && $self->score($score);
 205   # we need to set up internal hashs first!
 206
 207   $self->{'_seq'} = {};
 208   $self->{'_order'} = {};
 209   $self->{'_start_end_lists'} = {};
 210   $self->{'_dis_name'} = {};
 211   $self->{'_id'} = 'NoName';
 212   # maybe we should automatically read in from args. Hmmm...
 213   $id  && $self->id($id);
 214   $acc && $self->accession($acc);
 215   $desc && $self->description($desc);
 216   $coll && $self->annotation($coll);
 217   # sequence annotation is layered into a provided annotation collection (or dies)
 218   if ($sa) {
 219     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 220                  "with a sequence annotation collection")
 221         if !$coll;
 222     $coll->add_Annotation('seq_annotation', $sa);
 223   }
 224   if ($feats && ref $feats eq 'ARRAY') {
 225     for my $feat (@$feats) {
 226         $self->add_SeqFeature($feat);
 227     }
 228   }
 229   $con && $self->consensus($con);
 230   $cmeta && $self->consensus_meta($cmeta);
 231   # assumes these are in correct alignment order
 232   if ($seqs && ref($seqs) eq 'ARRAY') {
 233     for my $seq (@$seqs) {
 234         $self->add_seq($seq);
 235     }
 236   }
 237
 238   return $self; # success - we hope!
 239 }
 240
 241 =head1 Modifier methods
 242
 243 These methods modify the MSA by adding, removing or shuffling complete
 244 sequences.
 245
 246 =head2 add_seq
 247
 248  Title     : add_seq
 249  Usage     : $myalign->add_seq($newseq);
 250              $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
 251  Function  : Adds another sequence to the alignment. *Does not* align
 252              it - just adds it to the hashes.
 253              If -ORDER is specified, the sequence is inserted at the
 254              the position spec'd by -ORDER, and existing sequences
 255              are pushed down the storage array.
 256  Returns   : nothing
 257  Args      : A Bio::LocatableSeq object
 258              Positive integer for the sequence position (optional)
 259
 260 See L<Bio::LocatableSeq> for more information
 261
 262 =cut
 263
 264 sub addSeq {
 265     my $self = shift;
 266     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 267     $self->add_seq(@_);
 268 }
 269
 270 sub add_seq {
 271     my $self = shift;
 272     my @args = @_;
 273     my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
 274     my ($name,$id,$start,$end);
 275
 276     unless ($seq) {
 277         $self->throw("LocatableSeq argument required");
 278     }
 279     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 280         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 281     }
 282     !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
 283     $order--; # jay's patch (user-specified order is 1-origin)
 284
 285     if ($order < 0) {
 286         $self->throw("User-specified value for ORDER must be >= 1");
 287     }
 288
 289     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 290
 291     # build the symbol list for this sequence,
 292     # will prune out the gap and missing/match chars
 293     # when actually asked for the symbol list in the
 294     # symbol_chars
 295     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 296
 297     $name = $seq->get_nse;
 298
 299     if( $self->{'_seq'}->{$name} ) {
 300         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 301     }
 302     else {
 303         $self->debug( "Assigning $name to $order\n");
 304
 305     my $ordh = $self->{'_order'};
 306     if ($ordh->{$order}) {
 307         # make space to insert
 308         # $c->() returns (in reverse order) the first subsequence
 309         # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
 310         # (3,2,1), and $c->(2,4,5) returns (2).
 311         my $c;
 312         $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
 313         map {
 314      $ordh->{$_+1} = $ordh->{$_}
 315         } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
 316
 317     }
 318     $ordh->{$order} = $name;
 319
 320         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 321             $self->{'_start_end_lists'}->{$id} = [];
 322         }
 323         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 324     }
 325
 326     $self->{'_seq'}->{$name} = $seq;
 327
 328 }
 329
 330
 331 =head2 remove_seq
 332
 333  Title     : remove_seq
 334  Usage     : $aln->remove_seq($seq);
 335  Function  : Removes a single sequence from an alignment
 336  Returns   :
 337  Argument  : a Bio::LocatableSeq object
 338
 339 =cut
 340
 341 sub removeSeq {
 342     my $self = shift;
 343     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 344     $self->remove_seq(@_);
 345 }
 346
 347 sub remove_seq {
 348     my $self = shift;
 349     my $seq = shift;
 350     my ($name,$id);
 351
 352     $self->throw("Need Bio::Locatable seq argument ")
 353         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 354
 355     $id = $seq->id();
 356     $name = $seq->get_nse;
 357
 358     if( !exists $self->{'_seq'}->{$name} ) {
 359         $self->throw("Sequence $name does not exist in the alignment to remove!");
 360     }
 361
 362     delete $self->{'_seq'}->{$name};
 363
 364     # we need to remove this seq from the start_end_lists hash
 365
 366     if (exists $self->{'_start_end_lists'}->{$id}) {
 367         # we need to find the sequence in the array.
 368
 369         my ($i, $found);;
 370         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 371             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 372                 $found = 1;
 373                 last;
 374             }
 375         }
 376         if ($found) {
 377             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 378         }
 379         else {
 380             $self->throw("Could not find the sequence to remoce from the start-end list");
 381         }
 382     }
 383     else {
 384         $self->throw("There is no seq list for the name $id");
 385     }
 386     # we need to shift order hash
 387     my %rev_order = reverse %{$self->{'_order'}};
 388     my $no = $rev_order{$name};
 389     my $num_sequences = $self->num_sequences;
 390     for (; $no < $num_sequences; $no++) {
 391        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 392     }
 393     delete $self->{'_order'}->{$no};
 394     return 1;
 395 }
 396
 397
 398 =head2 purge
 399
 400  Title   : purge
 401  Usage   : $aln->purge(0.7);
 402  Function: Removes sequences above given sequence similarity
 403            This function will grind on large alignments. Beware!
 404  Example :
 405  Returns : An array of the removed sequences
 406  Args    : float, threshold for similarity
 407
 408 =cut
 409
 410 sub purge {
 411         my ($self,$perc) = @_;
 412         my (%duplicate, @dups);
 413
 414         my @seqs = $self->each_seq();
 415
 416         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 417                 my $seq = $seqs[$i];
 418
 419                 #skip if already in duplicate hash
 420                 next if exists $duplicate{$seq->display_id} ;
 421                 my $one = $seq->seq();
 422
 423                 my @one = split '', $one;       #split to get 1aa per array element
 424
 425                 for (my $j=$i+1;$j < @seqs;$j++) {
 426                         my $seq2 = $seqs[$j];
 427
 428                         #skip if already in duplicate hash
 429                         next if exists $duplicate{$seq2->display_id} ;
 430
 431                         my $two = $seq2->seq();
 432                         my @two = split '', $two;
 433
 434                         my $count = 0;
 435                         my $res = 0;
 436                         for (my $k=0;$k<@one;$k++) {
 437                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 438                                           $one[$k] eq $two[$k]) {
 439                                         $count++;
 440                                 }
 441                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 442                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 443                                         $res++;
 444                                 }
 445                         }
 446
 447                         my $ratio = 0;
 448                         $ratio = $count/$res unless $res == 0;
 449
 450                         # if above threshold put in duplicate hash and push onto
 451                         # duplicate array for returning to get_unique
 452                         if ( $ratio > $perc ) {
 453                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 454                                 $duplicate{$seq2->display_id} = 1;
 455                                 push @dups, $seq2;
 456                         }
 457                 }
 458         }
 459         foreach my $seq (@dups) {
 460                 $self->remove_seq($seq);
 461         }
 462         return @dups;
 463 }
 464
 465 =head2 sort_alphabetically
 466
 467  Title     : sort_alphabetically
 468  Usage     : $ali->sort_alphabetically
 469  Function  : Changes the order of the alignment to alphabetical on name
 470              followed by numerical by number.
 471  Returns   :
 472  Argument  :
 473
 474 =cut
 475
 476 sub sort_alphabetically {
 477     my $self = shift;
 478     my ($seq,$nse,@arr,%hash,$count);
 479
 480     foreach $seq ( $self->each_seq() ) {
 481         $nse = $seq->get_nse;
 482         $hash{$nse} = $seq;
 483     }
 484
 485     $count = 0;
 486
 487     %{$self->{'_order'}} = (); # reset the hash;
 488
 489     foreach $nse ( sort _alpha_startend keys %hash) {
 490         $self->{'_order'}->{$count} = $nse;
 491
 492         $count++;
 493     }
 494     1;
 495 }
 496
 497 =head2 sort_by_list
 498
 499  Title     : sort_by_list
 500  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 501  Function  : Arbitrarily order sequences in an alignment
 502  Returns   : A new Bio::SimpleAlign object
 503  Argument  : a file listing sequence names in intended order (one name per line)
 504
 505 =cut
 506
 507 sub sort_by_list {
 508     my ($self, $list) = @_;
 509     my (@seq, @ids, %order);
 510
 511     foreach my $seq ( $self->each_seq() ) {
 512         push @seq, $seq;
 513         push @ids, $seq->display_id;
 514     }
 515
 516     my $ct=1;
 517     open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
 518     while (<$listfh>) {
 519       chomp;
 520       my $name=$_;
 521       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 522       $order{$name}=$ct++;
 523     }
 524     close($listfh);
 525
 526     # use the map-sort-map idiom:
 527     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 528     my $aln = $self->new;
 529     foreach (@sorted) { $aln->add_seq($_) }
 530     return $aln;
 531 }
 532
 533 =head2 set_new_reference
 534
 535  Title     : set_new_reference
 536  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 537              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 538              as the reference (1st) sequence
 539  Function  : Change/Set a new reference (i.e., the first) sequence
 540  Returns   : a new Bio::SimpleAlign object.
 541              Throws an exception if designated sequence not found
 542  Argument  : a positive integer of sequence order, or a sequence name
 543              in the original alignment
 544
 545 =cut
 546
 547 sub set_new_reference {
 548     my ($self, $seqid) = @_;
 549     my $aln = $self->new;
 550     my (@seq, @ids, @new_seq);
 551     my $is_num=0;
 552     foreach my $seq ( $self->each_seq() ) {
 553         push @seq, $seq;
 554         push @ids, $seq->display_id;
 555     }
 556
 557     if ($seqid =~ /^\d+$/) { # argument is seq position
 558         $is_num=1;
 559         $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
 560     } else { # argument is a seq name
 561         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 562     }
 563
 564     for (my $i=0; $i<=$#seq; $i++) {
 565         my $pos=$i+1;
 566         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 567             unshift @new_seq, $seq[$i];
 568         } else {
 569             push @new_seq, $seq[$i];
 570         }
 571     }
 572     foreach (@new_seq) { $aln->add_seq($_);  }
 573     return $aln;
 574 }
 575
 576 sub _in_aln {  # check if input name exists in the alignment
 577     my ($str, $ref) = @_;
 578     foreach (@$ref) {
 579         return 1 if $str eq $_;
 580     }
 581     return 0;
 582 }
 583
 584
 585 =head2 uniq_seq
 586
 587  Title     : uniq_seq
 588  Usage     : $aln->uniq_seq():  Remove identical sequences in
 589              in the alignment.  Ambiguous base ("N", "n") and
 590              leading and ending gaps ("-") are NOT counted as
 591              differences.
 592  Function  : Make a new alignment of unique sequence types (STs)
 593  Returns   : 1a. if called in a scalar context,
 594                 a new Bio::SimpleAlign object (all sequences renamed as "ST")
 595              1b. if called in an array context,
 596                 a new Bio::SimpleAlign object, and a hashref whose keys
 597                 are sequence types, and whose values are arrayrefs to
 598                 lists of sequence ids within the corresponding sequence type
 599              2. if $aln->verbose > 0, ST of each sequence is sent to
 600                 STDERR (in a tabular format)
 601  Argument  : None
 602
 603 =cut
 604
 605 sub uniq_seq {
 606     my ($self, $seqid) = @_;
 607     my $aln = $self->new;
 608     my (%member, %order, @seq, @uniq_str, $st);
 609     my $order=0;
 610     my $len = $self->length();
 611     $st = {};
 612     foreach my $seq ( $self->each_seq() ) {
 613         my $str = $seq->seq();
 614
 615 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 616 # comparing two sequence strings
 617
 618     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 619         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 620     # 2nd, convert leading and ending gaps to "?":
 621         $str = &_convert_leading_ending_gaps($str, '-', '?');
 622     # Note that '?' also can mean unknown residue.
 623     # I don't like making global class member changes like this, too
 624     # prone to errors... -- cjfields 08-11-18
 625     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 626         my $new = Bio::LocatableSeq->new(
 627                      -id      => $seq->id(),
 628                                          -alphabet=> $seq->alphabet,
 629                                          -seq     => $str,
 630                                          -start   => $seq->start,
 631                                          -end     => $seq->end
 632                                          );
 633         push @seq, $new;
 634     }
 635
 636     foreach my $seq (@seq) {
 637         my $str = $seq->seq();
 638         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 639         if ($seen) { # seen before
 640             my @memb = @{$member{$key}};
 641             push @memb, $seq;
 642             $member{$key} = \@memb;
 643         } else {  # not seen
 644             push @uniq_str, $key;
 645             $order++;
 646             $member{$key} = [ ($seq) ];
 647             $order{$key} = $order;
 648         }
 649     }
 650
 651     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 652 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 653         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 654 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 655         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 656         my $gap='-';
 657         my $end= CORE::length($str2);
 658         $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
 659         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 660                                          -seq  =>$str2,
 661                                          -start=>1,
 662                                          -end  =>$end
 663                                          );
 664         $aln->add_seq($new);
 665         foreach (@{$member{$str}}) {
 666             push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
 667         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 668         }
 669     }
 670     return wantarray ? ($aln, $st) : $aln;
 671 }
 672
 673 sub _check_uniq {  # check if same seq exists in the alignment
 674     my ($str1, $ref, $length) = @_;
 675     my @char1=split //, $str1;
 676     my @array=@$ref;
 677
 678     return (0, $str1) if @array==0; # not seen (1st sequence)
 679
 680     foreach my $str2 (@array) {
 681         my $diff=0;
 682         my @char2=split //, $str2;
 683         for (my $i=0; $i<=$length-1; $i++) {
 684             next if $char1[$i] eq '?';
 685             next if $char2[$i] eq '?';
 686             $diff++ if $char1[$i] ne $char2[$i];
 687         }
 688         return (1, $str2) if $diff == 0;  # seen before
 689     }
 690
 691     return (0, $str1); # not seen
 692 }
 693
 694 sub _convert_leading_ending_gaps {
 695     my $s=shift;
 696     my $sym1=shift;
 697     my $sym2=shift;
 698     my @array=split //, $s;
 699 # convert leading char:
 700     for (my $i=0; $i<=$#array; $i++) {
 701         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 702     }
 703 # convert ending char:
 704     for (my $i = $#array; $i>= 0; $i--) {
 705         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 706     }
 707     my $s_new=join '', @array;
 708     return $s_new;
 709 }
 710
 711 =head1 Sequence selection methods
 712
 713 Methods returning one or more sequences objects.
 714
 715 =head2 each_seq
 716
 717  Title     : each_seq
 718  Usage     : foreach $seq ( $align->each_seq() )
 719  Function  : Gets a Seq object from the alignment
 720  Returns   : Seq object
 721  Argument  :
 722
 723 =cut
 724
 725 sub eachSeq {
 726     my $self = shift;
 727     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 728     $self->each_seq();
 729 }
 730
 731 sub each_seq {
 732         my $self = shift;
 733         my (@arr,$order);
 734
 735         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 736                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 737                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 738                 }
 739         }
 740         return @arr;
 741 }
 742
 743
 744 =head2 each_alphabetically
 745
 746  Title     : each_alphabetically
 747  Usage     : foreach $seq ( $ali->each_alphabetically() )
 748  Function  : Returns a sequence object, but the objects are returned
 749              in alphabetically sorted order.
 750              Does not change the order of the alignment.
 751  Returns   : Seq object
 752  Argument  :
 753
 754 =cut
 755
 756 sub each_alphabetically {
 757         my $self = shift;
 758         my ($seq,$nse,@arr,%hash,$count);
 759
 760         foreach $seq ( $self->each_seq() ) {
 761                 $nse = $seq->get_nse;
 762                 $hash{$nse} = $seq;
 763         }
 764
 765         foreach $nse ( sort _alpha_startend keys %hash) {
 766                 push(@arr,$hash{$nse});
 767         }
 768         return @arr;
 769 }
 770
 771 sub _alpha_startend {
 772     my ($aname,$astart,$bname,$bstart);
 773     ($aname,$astart) = split (/-/,$a);
 774     ($bname,$bstart) = split (/-/,$b);
 775
 776     if( $aname eq $bname ) {
 777         return $astart <=> $bstart;
 778     }
 779     else {
 780         return $aname cmp $bname;
 781     }
 782 }
 783
 784 =head2 each_seq_with_id
 785
 786  Title     : each_seq_with_id
 787  Usage     : foreach $seq ( $align->each_seq_with_id() )
 788  Function  : Gets a Seq objects from the alignment, the contents
 789              being those sequences with the given name (there may be
 790              more than one)
 791  Returns   : Seq object
 792  Argument  : a seq name
 793
 794 =cut
 795
 796 sub eachSeqWithId {
 797     my $self = shift;
 798     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 799     $self->each_seq_with_id(@_);
 800 }
 801
 802 sub each_seq_with_id {
 803     my $self = shift;
 804     my $id = shift;
 805
 806     $self->throw("Method each_seq_with_id needs a sequence name argument")
 807         unless defined $id;
 808
 809     my (@arr, $seq);
 810
 811     if (exists($self->{'_start_end_lists'}->{$id})) {
 812         @arr = @{$self->{'_start_end_lists'}->{$id}};
 813     }
 814     return @arr;
 815 }
 816
 817 =head2 get_seq_by_pos
 818
 819  Title     : get_seq_by_pos
 820  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 821  Function  : Gets a sequence based on its position in the alignment.
 822              Numbering starts from 1.  Sequence positions larger than
 823              num_sequences() will throw an error.
 824  Returns   : a Bio::LocatableSeq object
 825  Args      : positive integer for the sequence position
 826
 827 =cut
 828
 829 sub get_seq_by_pos {
 830
 831     my $self = shift;
 832     my ($pos) = @_;
 833
 834     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 835         unless $pos =~ /^\d+$/ and $pos > 0;
 836     $self->throw("No sequence at position [$pos]")
 837         unless $pos <= $self->num_sequences ;
 838
 839     my $nse = $self->{'_order'}->{--$pos};
 840     return $self->{'_seq'}->{$nse};
 841 }
 842
 843 =head2 get_seq_by_id
 844
 845  Title     : get_seq_by_id
 846  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 847  Function  : Gets a sequence based on its name.
 848              Sequences that do not exist will warn and return undef
 849  Returns   : a Bio::LocatableSeq object
 850  Args      : string for sequence name
 851
 852 =cut
 853
 854 sub get_seq_by_id {
 855     my ($self,$name) = @_;
 856     unless( defined $name ) {
 857       $self->warn("Must provide a sequence name");
 858       return;
 859     }
 860     for my $seq ( values %{$self->{'_seq'}} ) {
 861       if ( $seq->id eq $name) {
 862         return $seq;
 863       }
 864     }
 865     return;
 866 }
 867
 868 =head2 seq_with_features
 869
 870  Title   : seq_with_features
 871  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 872                                           -consensus => 60
 873                                           -mask =>
 874            sub { my $consensus = shift;
 875
 876                  for my $i (1..5){
 877                     my $n = 'N' x $i;
 878                     my $q = '\?' x $i;
 879                     while($consensus =~ /[^?]$q[^?]/){
 880                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 881                     }
 882                   }
 883                  return $consensus;
 884                }
 885                                          );
 886  Function: produces a Bio::Seq object by first splicing gaps from -pos
 887            (by means of a splice_by_seq_pos() call), then creating
 888            features using non-? chars (by means of a consensus_string()
 889            call with stringency -consensus).
 890  Returns : a Bio::Seq object
 891  Args    : -pos : required. sequence from which to build the Bio::Seq
 892              object
 893            -consensus : optional, defaults to consensus_string()'s
 894              default cutoff value
 895            -mask : optional, a coderef to apply to consensus_string()'s
 896              output before building features.  this may be useful for
 897              closing gaps of 1 bp by masking over them with N, for
 898              instance
 899
 900 =cut
 901
 902 sub seq_with_features{
 903    my ($self,%arg) = @_;
 904
 905    #first do the preparatory splice
 906    $self->throw("must provide a -pos argument") unless $arg{-pos};
 907    $self->splice_by_seq_pos($arg{-pos});
 908
 909    my $consensus_string = $self->consensus_string($arg{-consensus});
 910    $consensus_string = $arg{-mask}->($consensus_string)
 911          if defined($arg{-mask});
 912
 913    my(@bs,@es);
 914
 915    push @bs, 1 if $consensus_string =~ /^[^?]/;
 916
 917    while($consensus_string =~ /\?[^?]/g){
 918          push @bs, pos($consensus_string);
 919    }
 920    while($consensus_string =~ /[^?]\?/g){
 921          push @es, pos($consensus_string);
 922    }
 923
 924    push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
 925
 926    my $seq = Bio::Seq->new();
 927
 928 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 929 #                -source_tag => 'location',
 930 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 931 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 932 #                                                  );
 933 #   $seq->add_SeqFeature($rootfeature);
 934
 935    while(my $b = shift @bs){
 936          my $e = shift @es;
 937          $seq->add_SeqFeature(
 938        Bio::SeqFeature::Generic->new(
 939          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 940          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 941          -source_tag => $self->source || 'MSA',
 942        )
 943      );
 944    }
 945
 946    return $seq;
 947 }
 948
 949
 950 =head1 Create new alignments
 951
 952 The result of these methods are horizontal or vertical subsets of the
 953 current MSA.
 954
 955 =head2 select
 956
 957  Title     : select
 958  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 959  Function  : Creates a new alignment from a continuous subset of
 960              sequences.  Numbering starts from 1.  Sequence positions
 961              larger than num_sequences() will throw an error.
 962  Returns   : a Bio::SimpleAlign object
 963  Args      : positive integer for the first sequence
 964              positive integer for the last sequence to include (optional)
 965
 966 =cut
 967
 968 sub select {
 969     my $self = shift;
 970     my ($start, $end) = @_;
 971
 972     $self->throw("Select start has to be a positive integer, not [$start]")
 973         unless $start =~ /^\d+$/ and $start > 0;
 974     $self->throw("Select end has to be a positive integer, not [$end]")
 975         unless $end  =~ /^\d+$/ and $end > 0;
 976     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
 977         unless $start <= $end;
 978
 979     my $aln = $self->new;
 980     foreach my $pos ($start .. $end) {
 981         $aln->add_seq($self->get_seq_by_pos($pos));
 982     }
 983     $aln->id($self->id);
 984     # fix for meta, sf, ann
 985     return $aln;
 986 }
 987
 988 =head2 select_noncont
 989
 990  Title     : select_noncont
 991  Usage     : # 1st and 3rd sequences, sorted
 992              $aln2 = $aln->select_noncont(1, 3)
 993
 994              # 1st and 3rd sequences, sorted (same as first)
 995              $aln2 = $aln->select_noncont(3, 1)
 996
 997              # 1st and 3rd sequences, unsorted
 998              $aln2 = $aln->select_noncont('nosort',3, 1)
 999
1000  Function  : Creates a new alignment from a subset of sequences.  Numbering
1001              starts from 1.  Sequence positions larger than num_sequences() will
1002              throw an error.  Sorts the order added to new alignment by default,
1003              to prevent sorting pass 'nosort' as the first argument in the list.
1004  Returns   : a Bio::SimpleAlign object
1005  Args      : array of integers for the sequences.  If the string 'nosort' is
1006              passed as the first argument, the sequences will not be sorted
1007              in the new alignment but will appear in the order listed.
1008
1009 =cut
1010
1011 sub select_noncont {
1012         my $self = shift;
1013     my $nosort = 0;
1014         my (@pos) = @_;
1015     if ($pos[0] !~ m{^\d+$}) {
1016         my $sortcmd = shift @pos;
1017         if ($sortcmd eq 'nosort') {
1018             $nosort = 1;
1019         } else {
1020             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1021         }
1022     }
1023
1024     my $end = $self->num_sequences;
1025     foreach ( @pos ) {
1026                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1027                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1028         }
1029
1030         @pos = sort {$a <=> $b} @pos unless $nosort;
1031
1032         my $aln = $self->new;
1033         foreach my $p (@pos) {
1034                 $aln->add_seq($self->get_seq_by_pos($p));
1035         }
1036         $aln->id($self->id);
1037     # fix for meta, sf, ann
1038         return $aln;
1039 }
1040
1041 =head2 select_noncont_by_name
1042
1043  Title     : select_noncont_by_name
1044  Usage     : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1045  Function  : Creates a new alignment from a subset of sequences which are
1046              selected by name (sequence ID).
1047  Returns   : a Bio::SimpleAlign object
1048  Args      : array of names (i.e., identifiers) for the sequences.
1049
1050 =cut
1051
1052 sub select_noncont_by_name {
1053     my ($self, @names) = @_;
1054
1055     my $aln = $self->new;
1056     foreach my $name (@names) {
1057         $aln->add_seq($self->get_seq_by_id($name));
1058     }
1059     $aln->id($self->id);
1060
1061     return $aln;
1062 }
1063
1064 =head2 slice
1065
1066  Title     : slice
1067  Usage     : $aln2 = $aln->slice(20,30)
1068  Function  : Creates a slice from the alignment inclusive of start and
1069              end columns, and the first column in the alignment is denoted 1.
1070              Sequences with no residues in the slice are excluded from the
1071              new alignment and a warning is printed. Slice beyond the length of
1072              the sequence does not do padding.
1073  Returns   : A Bio::SimpleAlign object
1074  Args      : Positive integer for start column, positive integer for end column,
1075              optional boolean which if true will keep gap-only columns in the newly
1076              created slice. Example:
1077
1078              $aln2 = $aln->slice(20,30,1)
1079
1080 =cut
1081
1082 sub slice {
1083         my $self = shift;
1084         my ($start, $end, $keep_gap_only) = @_;
1085
1086         $self->throw("Slice start has to be a positive integer, not [$start]")
1087           unless $start =~ /^\d+$/ and $start > 0;
1088         $self->throw("Slice end has to be a positive integer, not [$end]")
1089           unless $end =~ /^\d+$/ and $end > 0;
1090         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1091           unless $start <= $end;
1092         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1093                                          "[$start] is too big.") if $start > $self->length;
1094     my $cons_meta = $self->consensus_meta;
1095         my $aln = $self->new;
1096         $aln->id($self->id);
1097         foreach my $seq ( $self->each_seq() ) {
1098             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1099             Bio::Seq::Meta->new
1100         (-id      => $seq->id,
1101                  -alphabet => $seq->alphabet,
1102                  -strand  => $seq->strand,
1103                  -verbose => $self->verbose) :
1104             Bio::LocatableSeq->new
1105         (-id      => $seq->id,
1106                  -alphabet => $seq->alphabet,
1107                  -strand  => $seq->strand,
1108                  -verbose => $self->verbose);
1109
1110             # seq
1111             my $seq_end = $end;
1112             $seq_end = $seq->length if( $end > $seq->length );
1113
1114             my $slice_seq = $seq->subseq($start, $seq_end);
1115             $new_seq->seq( $slice_seq );
1116
1117         # Allowed extra characters in string
1118         my $allowed_chars = '';
1119         if (exists $self->{_mask_char}) {
1120             $allowed_chars = $self->{_mask_char};
1121             $allowed_chars = quotemeta $allowed_chars;
1122         }
1123         $slice_seq =~ s/[^\w$allowed_chars]//g;
1124
1125         if ($start > 1) {
1126             my $pre_start_seq = $seq->subseq(1, $start - 1);
1127             $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1128             if (!defined($seq->strand)) {
1129                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130             } elsif ($seq->strand < 0){
1131                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132             } else {
1133                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1134             }
1135             } else {
1136             if ((defined $seq->strand)&&($seq->strand < 0)){
1137                 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138             } else {
1139                $new_seq->start( $seq->start);
1140             }
1141             }
1142         if ($new_seq->isa('Bio::Seq::MetaI')) {
1143             for my $meta_name ($seq->meta_names) {
1144                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1145             }
1146         }
1147             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148
1149             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150             $aln->add_seq($new_seq);
1151             } else {
1152             if( $keep_gap_only ) {
1153                 $aln->add_seq($new_seq);
1154             } else {
1155                 my $nse = $seq->get_nse();
1156                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157                     " Sequence excluded from the new alignment.");
1158             }
1159             }
1160         }
1161     if ($cons_meta) {
1162         my $new = Bio::Seq::Meta->new();
1163         for my $meta_name ($cons_meta->meta_names) {
1164             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165         }
1166         $aln->consensus_meta($new);
1167     }
1168     $aln->annotation($self->annotation);
1169     # fix for meta, sf, ann
1170         return $aln;
1171 }
1172
1173 =head2 remove_columns
1174
1175  Title     : remove_columns
1176  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177              $aln2 = $aln->remove_columns([0,0],[6,8])
1178  Function  : Creates an aligment with columns removed corresponding to
1179              the specified type or by specifying the columns by number.
1180  Returns   : Bio::SimpleAlign object
1181  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182              'all_gaps_columns') or array ref where the referenced array
1183              contains a pair of integers that specify a range.
1184              The first column is 0
1185
1186 =cut
1187
1188 sub remove_columns {
1189     my ($self,@args) = @_;
1190     @args || $self->throw("Must supply column ranges or column types");
1191     my $aln;
1192
1193     if ($args[0][0] =~ /^[a-z_]+$/i) {
1194         $aln = $self->_remove_columns_by_type($args[0]);
1195     } elsif ($args[0][0] =~ /^\d+$/) {
1196         $aln = $self->_remove_columns_by_num(\@args);
1197     } else {
1198         $self->throw("You must pass array references to remove_columns(), not @args");
1199     }
1200     # fix for meta, sf, ann
1201     $aln;
1202 }
1203
1204
1205 =head2 remove_gaps
1206
1207  Title     : remove_gaps
1208  Usage     : $aln2 = $aln->remove_gaps
1209  Function  : Creates an aligment with gaps removed
1210  Returns   : a Bio::SimpleAlign object
1211  Args      : a gap character(optional) if none specified taken
1212                 from $self->gap_char,
1213              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214                         indicates that only all-gaps columns should be deleted
1215
1216 Used from method L<remove_columns> in most cases. Set gap character
1217 using L<gap_char()|gap_char>.
1218
1219 =cut
1220
1221 sub remove_gaps {
1222     my ($self,$gapchar,$all_gaps_columns) = @_;
1223     my $gap_line;
1224     if ($all_gaps_columns) {
1225         $gap_line = $self->all_gap_line($gapchar);
1226     } else {
1227         $gap_line = $self->gap_line($gapchar);
1228     }
1229     my $aln = $self->new;
1230
1231     my @remove;
1232     my $length = 0;
1233     my $del_char = $gapchar || $self->gap_char;
1234     # Do the matching to get the segments to remove
1235     while ($gap_line =~ m/[$del_char]/g) {
1236         my $start = pos($gap_line)-1;
1237         $gap_line =~ m/\G[$del_char]+/gc;
1238         my $end = pos($gap_line)-1;
1239
1240         #have to offset the start and end for subsequent removes
1241         $start-=$length;
1242         $end  -=$length;
1243         $length += ($end-$start+1);
1244         push @remove, [$start,$end];
1245     }
1246
1247     #remove the segments
1248     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249     # fix for meta, sf, ann
1250     return $aln;
1251 }
1252
1253
1254 sub _remove_col {
1255     my ($self,$aln,$remove) = @_;
1256     my @new;
1257
1258     my $gap = $self->gap_char;
1259
1260     # splice out the segments and create new seq
1261     foreach my $seq($self->each_seq){
1262         my $new_seq = Bio::LocatableSeq->new(
1263                                              -id      => $seq->id,
1264                                              -alphabet=> $seq->alphabet,
1265                                              -strand  => $seq->strand,
1266                                              -verbose => $self->verbose);
1267         my $sequence = $seq->seq;
1268         foreach my $pair(@{$remove}){
1269             my $start = $pair->[0];
1270             my $end   = $pair->[1];
1271             $sequence = $seq->seq unless $sequence;
1272             my $orig = $sequence;
1273             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1274             my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275             $sequence = $head.$tail;
1276             # start
1277             unless (defined $new_seq->start) {
1278                 if ($start == 0) {
1279                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1281                 }
1282                 else {
1283                     my $start_adjust = $orig =~ /^$gap+/;
1284                     if ($start_adjust) {
1285                         $start_adjust = $+[0] == $start;
1286                     }
1287                     $new_seq->start($seq->start + $start_adjust);
1288                 }
1289             }
1290             # end
1291             if (($end + 1) >= CORE::length($orig)) {
1292                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293                 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294             }
1295             else {
1296                 $new_seq->end($seq->end);
1297             }
1298         }
1299
1300         if ($new_seq->end < $new_seq->start) {
1301             # we removed all columns except for gaps: set to 0 to indicate no
1302             # sequence
1303             $new_seq->start(0);
1304             $new_seq->end(0);
1305         }
1306
1307         $new_seq->seq($sequence) if $sequence;
1308                 push @new, $new_seq;
1309     }
1310     # add the new seqs to the alignment
1311     foreach my $new(@new){
1312         $aln->add_seq($new);
1313     }
1314     # fix for meta, sf, ann
1315     return $aln;
1316 }
1317
1318 sub _remove_columns_by_type {
1319         my ($self,$type) = @_;
1320         my $aln = $self->new;
1321         my @remove;
1322
1323         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1324         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1325         my %matchchars = ( 'match'           => '\*',
1326                        'weak'             => '\.',
1327                        'strong'           => ':',
1328                        'mismatch'         => ' ',
1329                        'gaps'             => '',
1330                        'all_gaps_columns' => ''
1331                      );
1332         # get the characters to delete against
1333         my $del_char;
1334         foreach my $type (@{$type}){
1335                 $del_char.= $matchchars{$type};
1336         }
1337
1338         my $length = 0;
1339         my $match_line = $self->match_line;
1340         # do the matching to get the segments to remove
1341         if($del_char){
1342                 while($match_line =~ m/[$del_char]/g ){
1343                         my $start = pos($match_line)-1;
1344                         $match_line=~/\G[$del_char]+/gc;
1345                         my $end = pos($match_line)-1;
1346
1347                         #have to offset the start and end for subsequent removes
1348                         $start-=$length;
1349                         $end  -=$length;
1350                         $length += ($end-$start+1);
1351                         push @remove, [$start,$end];
1352                 }
1353         }
1354
1355         # remove the segments
1356         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357         $aln = $aln->remove_gaps() if $gap;
1358         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359     # fix for meta, sf, ann
1360         $aln;
1361 }
1362
1363
1364 sub _remove_columns_by_num {
1365         my ($self,$positions) = @_;
1366         my $aln = $self->new;
1367
1368         # sort the positions
1369         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1370
1371     my @remove;
1372     my $length = 0;
1373     foreach my $pos (@{$positions}) {
1374         my ($start, $end) = @{$pos};
1375
1376         #have to offset the start and end for subsequent removes
1377         $start-=$length;
1378         $end  -=$length;
1379         $length += ($end-$start+1);
1380         push @remove, [$start,$end];
1381     }
1382
1383     #remove the segments
1384     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385     # fix for meta, sf, ann
1386         $aln;
1387 }
1388
1389
1390 =head1 Change sequences within the MSA
1391
1392 These methods affect characters in all sequences without changing the
1393 alignment.
1394
1395 =head2 splice_by_seq_pos
1396
1397  Title   : splice_by_seq_pos
1398  Usage   : $status = splice_by_seq_pos(1);
1399  Function: splices all aligned sequences where the specified sequence
1400            has gaps.
1401  Example :
1402  Returns : 1 on success
1403  Args    : position of sequence to splice by
1404
1405
1406 =cut
1407
1408 sub splice_by_seq_pos{
1409   my ($self,$pos) = @_;
1410
1411   my $guide = $self->get_seq_by_pos($pos);
1412   my $guide_seq = $guide->seq;
1413
1414   $guide_seq =~ s/\./\-/g;
1415
1416   my @gaps = ();
1417   $pos = -1;
1418   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419     unshift @gaps, $pos;
1420     $pos++;
1421   }
1422
1423   foreach my $seq ($self->each_seq){
1424     my @bases = split '', $seq->seq;
1425
1426     splice(@bases, $_, 1) foreach @gaps;
1427     $seq->seq(join('', @bases));
1428   }
1429
1430   1;
1431 }
1432
1433 =head2 map_chars
1434
1435  Title     : map_chars
1436  Usage     : $ali->map_chars('\.','-')
1437  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438              characters.
1439
1440              Note that the first argument is interpreted as a regexp
1441              so be careful and escape any wild card characters (e.g.
1442              do $ali->map_chars('\.','-') to replace periods with dashes.
1443  Returns   : 1 on success
1444  Argument  : A regexp and a string
1445
1446 =cut
1447
1448 sub map_chars {
1449     my $self = shift;
1450     my $from = shift;
1451     my $to   = shift;
1452     my ( $seq, $temp );
1453
1454     $self->throw("Need two arguments: a regexp and a string")
1455       unless defined $from and defined $to;
1456
1457     foreach $seq ( $self->each_seq() ) {
1458         $temp = $seq->seq();
1459         $temp =~ s/$from/$to/g;
1460         $seq->seq($temp);
1461     }
1462     return 1;
1463 }
1464
1465
1466 =head2 uppercase
1467
1468  Title     : uppercase()
1469  Usage     : $ali->uppercase()
1470  Function  : Sets all the sequences to uppercase
1471  Returns   : 1 on success
1472  Argument  :
1473
1474 =cut
1475
1476 sub uppercase {
1477     my $self = shift;
1478     my $seq;
1479     my $temp;
1480
1481     foreach $seq ( $self->each_seq() ) {
1482       $temp = $seq->seq();
1483       $temp =~ tr/[a-z]/[A-Z]/;
1484
1485       $seq->seq($temp);
1486     }
1487     return 1;
1488 }
1489
1490 =head2 cigar_line
1491
1492  Title    : cigar_line()
1493  Usage    : %cigars = $align->cigar_line()
1494  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495             Report) line for each sequence in the alignment. Examples are
1496             "1,60" or "5,10:12,58", where the numbers refer to conserved
1497             positions within the alignment. The keys of the hash are the
1498             NSEs (name/start/end) assigned to each sequence.
1499  Args     : threshold (optional, defaults to 100)
1500  Returns  : Hash of strings (cigar lines)
1501
1502 =cut
1503
1504 sub cigar_line {
1505         my $self = shift;
1506         my $thr=shift||100;
1507         my %cigars;
1508
1509         my @consensus = split "",($self->consensus_string($thr));
1510         my $len = $self->length;
1511         my $gapchar = $self->gap_char;
1512
1513         # create a precursor, something like (1,4,5,6,7,33,45),
1514         # where each number corresponds to a conserved position
1515         foreach my $seq ( $self->each_seq ) {
1516                 my @seq = split "", uc ($seq->seq);
1517                 my $pos = 1;
1518                 for (my $x = 0 ; $x < $len ; $x++ ) {
1519                         if ($seq[$x] eq $consensus[$x]) {
1520                                 push @{$cigars{$seq->get_nse}},$pos;
1521                                 $pos++;
1522                         } elsif ($seq[$x] ne $gapchar) {
1523                                 $pos++;
1524                         }
1525                 }
1526         }
1527         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528         for my $name (keys %cigars) {
1529                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1530                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1531       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1532            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1533                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1534                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1535                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1536                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1537                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1538                         }
1539       }
1540         }
1541   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542   for my $name (keys %cigars) {
1543           my @remove;
1544           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1545                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1546                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1547                       unshift @remove,$x;
1548               }
1549            }
1550       for my $pos (@remove) {
1551                         splice @{$cigars{$name}}, $pos, 1;
1552            }
1553    }
1554    # join and punctuate
1555    for my $name (keys %cigars) {
1556           my ($start,$end,$str) = "";
1557           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1558                   $str .= ($start . "," . $end . ":");
1559           }
1560           $str =~ s/:$//;
1561       $cigars{$name} = $str;
1562    }
1563    %cigars;
1564 }
1565
1566
1567 =head2 match_line
1568
1569  Title    : match_line()
1570  Usage    : $line = $align->match_line()
1571  Function : Generates a match line - much like consensus string
1572             except that a line indicating the '*' for a match.
1573  Args     : (optional) Match line characters ('*' by default)
1574             (optional) Strong match char (':' by default)
1575             (optional) Weak match char ('.' by default)
1576  Returns  : String
1577
1578 =cut
1579
1580 sub match_line {
1581         my ($self,$matchlinechar, $strong, $weak) = @_;
1582         my %matchchars = ('match'    => $matchlinechar || '*',
1583                                                           'weak'     => $weak          || '.',
1584                                                           'strong'   => $strong        || ':',
1585                                                           'mismatch' => ' ',
1586                                                   );
1587
1588         my @seqchars;
1589         my $alphabet;
1590         foreach my $seq ( $self->each_seq ) {
1591                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592                 $alphabet = $seq->alphabet unless defined $alphabet;
1593         }
1594         my $refseq = shift @seqchars;
1595         # let's just march down the columns
1596         my $matchline;
1597  POS:
1598         foreach my $pos ( 0..$self->length ) {
1599                 my $refchar = $refseq->[$pos];
1600                 my $char = $matchchars{'mismatch'};
1601                 unless( defined $refchar ) {
1602                         last if $pos == $self->length; # short circuit on last residue
1603                         # this in place to handle jason's soon-to-be-committed
1604                         # intron mapping code
1605                         goto bottom;
1606                 }
1607                 my %col = ($refchar => 1);
1608                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609                 foreach my $seq ( @seqchars ) {
1610                         next if $pos >= scalar @$seq;
1611                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1612                                                           $seq->[$pos] eq ' ' );
1613                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1614                 }
1615                 my @colresidues = sort keys %col;
1616
1617                 # if all the values are the same
1618                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1619                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621                         # matches for protein seqs
1622             TYPE:
1623                         foreach my $type ( qw(strong weak) ) {
1624                                 # iterate through categories
1625                                 my %groups;
1626                                 # iterate through each of the aa in the col
1627                                 # look to see which groups it is in
1628                                 foreach my $c ( @colresidues ) {
1629                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1630                                                 push @{$groups{$f}},$c;
1631                                         }
1632                                 }
1633                          GRP:
1634                                 foreach my $cols ( values %groups ) {
1635                                         @$cols = sort @$cols;
1636                                         # now we are just testing to see if two arrays
1637                                         # are identical w/o changing either one
1638                                         # have to be same len
1639                                         next if( scalar @$cols != scalar @colresidues );
1640                                         # walk down the length and check each slot
1641                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1642                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1643                                         }
1644                                         $char = $matchchars{$type};
1645                                         last TYPE;
1646                                 }
1647                         }
1648                 }
1649          bottom:
1650                 $matchline .= $char;
1651         }
1652         return $matchline;
1653 }
1654
1655
1656 =head2 gap_line
1657
1658  Title    : gap_line()
1659  Usage    : $line = $align->gap_line()
1660  Function : Generates a gap line - much like consensus string
1661             except that a line where '-' represents gap
1662  Args     : (optional) gap line characters ('-' by default)
1663  Returns  : string
1664
1665 =cut
1666
1667 sub gap_line {
1668     my ($self,$gapchar) = @_;
1669     $gapchar = $gapchar || $self->gap_char;
1670     my %gap_hsh; # column gaps vector
1671     foreach my $seq ( $self->each_seq ) {
1672                 my $i = 0;
1673         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1674                   map {[$i++, $_]} split(//, uc ($seq->seq));
1675     }
1676     my $gap_line;
1677     foreach my $pos ( 0..$self->length-1 ) {
1678           $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1679     }
1680     return $gap_line;
1681 }
1682
1683 =head2 all_gap_line
1684
1685  Title    : all_gap_line()
1686  Usage    : $line = $align->all_gap_line()
1687  Function : Generates a gap line - much like consensus string
1688             except that a line where '-' represents all-gap column
1689  Args     : (optional) gap line characters ('-' by default)
1690  Returns  : string
1691
1692 =cut
1693
1694 sub all_gap_line {
1695     my ($self,$gapchar) = @_;
1696     $gapchar = $gapchar || $self->gap_char;
1697     my %gap_hsh;                # column gaps counter hash
1698     my @seqs = $self->each_seq;
1699     foreach my $seq ( @seqs ) {
1700         my $i = 0;
1701         map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1702         map {[$i++, $_]} split(//, uc ($seq->seq));
1703     }
1704     my $gap_line;
1705     foreach my $pos ( 0..$self->length-1 ) {
1706         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707             # gaps column
1708             $gap_line .= $self->gap_char;
1709         } else {
1710             $gap_line .= '.';
1711         }
1712     }
1713     return $gap_line;
1714 }
1715
1716 =head2 gap_col_matrix
1717
1718  Title    : gap_col_matrix()
1719  Usage    : my $cols = $align->gap_col_matrix()
1720  Function : Generates an array where each element in the array is a
1721             hash reference with a key of the sequence name and a
1722             value of 1 if the sequence has a gap at that column
1723  Returns  : Reference to an array
1724  Args     : Optional: gap line character ($aln->gap_char or '-' by default)
1725
1726 =cut
1727
1728 sub gap_col_matrix {
1729     my ( $self, $gapchar ) = @_;
1730     $gapchar = $gapchar || $self->gap_char;
1731     my %gap_hsh;    # column gaps vector
1732     my @cols;
1733     foreach my $seq ( $self->each_seq ) {
1734         my $i   = 0;
1735         my $str = $seq->seq;
1736         my $len = $seq->length;
1737         my $ch;
1738         my $id = $seq->display_id;
1739         while ( $i < $len ) {
1740             $ch = substr( $str, $i, 1 );
1741             $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1742         }
1743     }
1744     return \@cols;
1745 }
1746
1747 =head2 match
1748
1749  Title     : match()
1750  Usage     : $ali->match()
1751  Function  : Goes through all columns and changes residues that are
1752              identical to residue in first sequence to match '.'
1753              character. Sets match_char.
1754
1755              USE WITH CARE: Most MSA formats do not support match
1756              characters in sequences, so this is mostly for output
1757              only. NEXUS format (Bio::AlignIO::nexus) can handle
1758              it.
1759  Returns   : 1 on success
1760  Argument  : a match character, optional, defaults to '.'
1761
1762 =cut
1763
1764 sub match {
1765     my ( $self, $match ) = @_;
1766
1767     $match ||= '.';
1768     my ($matching_char) = $match;
1769     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/;    #';
1770     $self->map_chars( $matching_char, '-' );
1771
1772     my @seqs = $self->each_seq();
1773     return 1 unless scalar @seqs > 1;
1774
1775     my $refseq  = shift @seqs;
1776     my @refseq  = split //, $refseq->seq;
1777     my $gapchar = $self->gap_char;
1778
1779     foreach my $seq (@seqs) {
1780         my @varseq = split //, $seq->seq();
1781         for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1782             $varseq[$i] = $match
1783                 if defined $refseq[$i]
1784                 && ( $refseq[$i] =~ /[A-Za-z\*]/
1785                 || $refseq[$i] =~ /$gapchar/ )
1786                 && $refseq[$i] eq $varseq[$i];
1787         }
1788         $seq->seq( join '', @varseq );
1789     }
1790     $self->match_char($match);
1791     return 1;
1792 }
1793
1794
1795
1796 =head2 unmatch
1797
1798  Title     : unmatch()
1799  Usage     : $ali->unmatch()
1800  Function  : Undoes the effect of method match. Unsets match_char.
1801  Returns   : 1 on success
1802  Argument  : a match character, optional, defaults to '.'
1803
1804 See L<match> and L<match_char>
1805
1806 =cut
1807
1808 sub unmatch {
1809     my ( $self, $match ) = @_;
1810
1811     $match ||= '.';
1812
1813     my @seqs = $self->each_seq();
1814     return 1 unless scalar @seqs > 1;
1815
1816     my $refseq  = shift @seqs;
1817     my @refseq  = split //, $refseq->seq;
1818     my $gapchar = $self->gap_char;
1819     foreach my $seq (@seqs) {
1820         my @varseq = split //, $seq->seq();
1821         for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1822             $varseq[$i] = $refseq[$i]
1823                 if defined $refseq[$i]
1824                 && ( $refseq[$i] =~ /[A-Za-z\*]/
1825                 || $refseq[$i] =~ /$gapchar/ )
1826                 && $varseq[$i] eq $match;
1827         }
1828         $seq->seq( join '', @varseq );
1829     }
1830     $self->match_char('');
1831     return 1;
1832 }
1833
1834
1835 =head1 MSA attributes
1836
1837 Methods for setting and reading the MSA attributes.
1838
1839 Note that the methods defining character semantics depend on the user
1840 to set them sensibly.  They are needed only by certain input/output
1841 methods. Unset them by setting to an empty string ('').
1842
1843 =head2 id
1844
1845  Title     : id
1846  Usage     : $myalign->id("Ig")
1847  Function  : Gets/sets the id field of the alignment
1848  Returns   : An id string
1849  Argument  : An id string (optional)
1850
1851 =cut
1852
1853 sub id {
1854     my ( $self, $name ) = @_;
1855
1856     if ( defined($name) ) {
1857         $self->{'_id'} = $name;
1858     }
1859
1860     return $self->{'_id'};
1861 }
1862
1863 =head2 accession
1864
1865  Title     : accession
1866  Usage     : $myalign->accession("PF00244")
1867  Function  : Gets/sets the accession field of the alignment
1868  Returns   : An acc string
1869  Argument  : An acc string (optional)
1870
1871 =cut
1872
1873 sub accession {
1874     my ( $self, $acc ) = @_;
1875
1876     if ( defined($acc) ) {
1877         $self->{'_accession'} = $acc;
1878     }
1879
1880     return $self->{'_accession'};
1881 }
1882
1883 =head2 description
1884
1885  Title     : description
1886  Usage     : $myalign->description("14-3-3 proteins")
1887  Function  : Gets/sets the description field of the alignment
1888  Returns   : An description string
1889  Argument  : An description string (optional)
1890
1891 =cut
1892
1893 sub description {
1894     my ( $self, $name ) = @_;
1895
1896     if ( defined($name) ) {
1897         $self->{'_description'} = $name;
1898     }
1899
1900     return $self->{'_description'};
1901 }
1902
1903 =head2 missing_char
1904
1905  Title     : missing_char
1906  Usage     : $myalign->missing_char("?")
1907  Function  : Gets/sets the missing_char attribute of the alignment
1908              It is generally recommended to set it to 'n' or 'N'
1909              for nucleotides and to 'X' for protein.
1910  Returns   : An missing_char string,
1911  Argument  : An missing_char string (optional)
1912
1913 =cut
1914
1915 sub missing_char {
1916     my ( $self, $char ) = @_;
1917
1918     if ( defined $char ) {
1919         $self->throw("Single missing character, not [$char]!")
1920             if CORE::length($char) > 1;
1921         $self->{'_missing_char'} = $char;
1922     }
1923
1924     return $self->{'_missing_char'};
1925 }
1926
1927
1928 =head2 match_char
1929
1930  Title     : match_char
1931  Usage     : $myalign->match_char('.')
1932  Function  : Gets/sets the match_char attribute of the alignment
1933  Returns   : An match_char string,
1934  Argument  : An match_char string (optional)
1935
1936 =cut
1937
1938 sub match_char {
1939     my ( $self, $char ) = @_;
1940
1941     if ( defined $char ) {
1942         $self->throw("Single match character, not [$char]!")
1943             if CORE::length($char) > 1;
1944         $self->{'_match_char'} = $char;
1945     }
1946
1947     return $self->{'_match_char'};
1948 }
1949
1950 =head2 gap_char
1951
1952  Title     : gap_char
1953  Usage     : $myalign->gap_char('-')
1954  Function  : Gets/sets the gap_char attribute of the alignment
1955  Returns   : An gap_char string, defaults to '-'
1956  Argument  : An gap_char string (optional)
1957
1958 =cut
1959
1960 sub gap_char {
1961     my ( $self, $char ) = @_;
1962
1963     if ( defined $char || !defined $self->{'_gap_char'} ) {
1964         $char = '-' unless defined $char;
1965         $self->throw("Single gap character, not [$char]!")
1966             if CORE::length($char) > 1;
1967         $self->{'_gap_char'} = $char;
1968     }
1969     return $self->{'_gap_char'};
1970 }
1971
1972
1973 =head2 symbol_chars
1974
1975  Title   : symbol_chars
1976  Usage   : my @symbolchars = $aln->symbol_chars;
1977  Function: Returns all the seen symbols (other than gaps)
1978  Returns : array of characters that are the seen symbols
1979  Args    : boolean to include the gap/missing/match characters
1980
1981 =cut
1982
1983 sub symbol_chars{
1984    my ($self,$includeextra) = @_;
1985
1986    unless ($self->{'_symbols'}) {
1987        foreach my $seq ($self->each_seq) {
1988            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1989        }
1990    }
1991    my %copy = %{$self->{'_symbols'}};
1992    if( ! $includeextra ) {
1993        foreach my $char ( $self->gap_char, $self->match_char,
1994                           $self->missing_char) {
1995            delete $copy{$char} if( defined $char );
1996        }
1997    }
1998    return keys %copy;
1999 }
2000
2001 =head1 Alignment descriptors
2002
2003 These read only methods describe the MSA in various ways.
2004
2005
2006 =head2 score
2007
2008  Title     : score
2009  Usage     : $str = $ali->score()
2010  Function  : get/set a score of the alignment
2011  Returns   : a score for the alignment
2012  Argument  : an optional score to set
2013
2014 =cut
2015
2016 sub score {
2017   my $self = shift;
2018   $self->{score} = shift if @_;
2019   return $self->{score};
2020 }
2021
2022 =head2 consensus_string
2023
2024  Title     : consensus_string
2025  Usage     : $str = $ali->consensus_string($threshold_percent)
2026  Function  : Makes a strict consensus
2027  Returns   : Consensus string
2028  Argument  : Optional threshold ranging from 0 to 100.
2029              The consensus residue has to appear at least threshold %
2030              of the sequences at a given location, otherwise a '?'
2031              character will be placed at that location.
2032              (Default value = 0%)
2033
2034 =cut
2035
2036 sub consensus_string {
2037     my $self      = shift;
2038     my $threshold = shift;
2039
2040     my $out = "";
2041     my $len = $self->length - 1;
2042
2043     foreach ( 0 .. $len ) {
2044         $out .= $self->_consensus_aa( $_, $threshold );
2045     }
2046     return $out;
2047 }
2048
2049
2050 =head2 consensus_conservation
2051
2052  Title     : consensus_conservation
2053  Usage     : @conservation = $ali->consensus_conservation();
2054  Function  : Conservation (as a percent) of each position of alignment
2055  Returns   : Array of percentages [0-100]. Gap columns are 0% conserved.
2056  Argument  :
2057
2058 =cut
2059
2060 sub consensus_conservation {
2061     my $self = shift;
2062     my @cons;
2063     my $num_sequences = $self->num_sequences;
2064     foreach my $point (0..$self->length-1) {
2065         my %hash = $self->_consensus_counts($point);
2066         # max frequency of a non-gap letter
2067         my $max = (sort {$b<=>$a} values %hash )[0];
2068         push @cons, 100 * $max / $num_sequences;
2069     }
2070     return @cons;
2071 }
2072
2073 sub _consensus_aa {
2074     my $self = shift;
2075     my $point = shift;
2076     my $threshold_percent = shift || -1 ;
2077     my ($seq,%hash,$count,$letter,$key);
2078     my $gapchar = $self->gap_char;
2079     %hash = $self->_consensus_counts($point);
2080     my $number_of_sequences = $self->num_sequences();
2081     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2082     $count = -1;
2083     $letter = '?';
2084
2085     foreach $key ( sort keys %hash ) {
2086         # print "Now at $key $hash{$key}\n";
2087         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2088             $letter = $key;
2089             $count = $hash{$key};
2090         }
2091     }
2092     return $letter;
2093 }
2094
2095 # Frequency of each letter in one column
2096 sub _consensus_counts {
2097     my $self = shift;
2098     my $point = shift;
2099     my %hash;
2100     my $gapchar = $self->gap_char;
2101     foreach my $seq ( $self->each_seq() ) {
2102         my $letter = substr($seq->seq,$point,1);
2103         $self->throw("--$point-----------") if $letter eq '';
2104         ($letter eq $gapchar || $letter =~ /\./) && next;
2105         $hash{$letter}++;
2106     }
2107     return %hash;
2108 }
2109
2110
2111 =head2 consensus_iupac
2112
2113  Title     : consensus_iupac
2114  Usage     : $str = $ali->consensus_iupac()
2115  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2116              and RNA. The output is in upper case except when gaps in
2117              a column force output to be in lower case.
2118
2119              Note that if your alignment sequences contain a lot of
2120              IUPAC ambiquity codes you often have to manually set
2121              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2122              indicate a protein sequence.
2123  Returns   : consensus string
2124  Argument  : none
2125  Throws    : on protein sequences
2126
2127 =cut
2128
2129 sub consensus_iupac {
2130     my $self = shift;
2131     my $out  = "";
2132     my $len  = $self->length - 1;
2133
2134     # only DNA and RNA sequences are valid
2135     foreach my $seq ( $self->each_seq() ) {
2136         $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2137             if $seq->alphabet eq 'protein';
2138     }
2139
2140     # loop over the alignment columns
2141     foreach my $count ( 0 .. $len ) {
2142         $out .= $self->_consensus_iupac($count);
2143     }
2144     return $out;
2145 }
2146
2147 sub _consensus_iupac {
2148     my ($self, $column) = @_;
2149     my ($string, $char, $rna);
2150
2151     #determine all residues in a column
2152     foreach my $seq ( $self->each_seq() ) {
2153         $string .= substr($seq->seq, $column, 1);
2154     }
2155     $string = uc $string;
2156
2157     # quick exit if there's an N in the string
2158     if ($string =~ /N/) {
2159         $string =~ /\W/ ? return 'n' : return 'N';
2160     }
2161     # ... or if there are only gap characters
2162     return '-' if $string =~ /^\W+$/;
2163
2164     # treat RNA as DNA in regexps
2165     if ($string =~ /U/) {
2166         $string =~ s/U/T/;
2167         $rna = 1;
2168     }
2169
2170     # the following s///'s only need to be done to the _first_ ambiguity code
2171     # as we only need to see the _range_ of characters in $string
2172
2173     if ($string =~ /[VDHB]/) {
2174         $string =~ s/V/AGC/;
2175         $string =~ s/D/AGT/;
2176         $string =~ s/H/ACT/;
2177         $string =~ s/B/CTG/;
2178     }
2179
2180     if ($string =~ /[SKYRWM]/) {
2181         $string =~ s/S/GC/;
2182         $string =~ s/K/GT/;
2183         $string =~ s/Y/CT/;
2184         $string =~ s/R/AG/;
2185         $string =~ s/W/AT/;
2186         $string =~ s/M/AC/;
2187     }
2188
2189     # and now the guts of the thing
2190
2191     if ($string =~ /A/) {
2192         $char = 'A';                     # A                      A
2193         if ($string =~ /G/) {
2194             $char = 'R';                 # A and G (purines)      R
2195             if ($string =~ /C/) {
2196                 $char = 'V';             # A and G and C          V
2197                 if ($string =~ /T/) {
2198                     $char = 'N';         # A and G and C and T    N
2199                 }
2200             } elsif ($string =~ /T/) {
2201                 $char = 'D';             # A and G and T          D
2202             }
2203         } elsif ($string =~ /C/) {
2204             $char = 'M';                 # A and C                M
2205             if ($string =~ /T/) {
2206                 $char = 'H';             # A and C and T          H
2207             }
2208         } elsif ($string =~ /T/) {
2209             $char = 'W';                 # A and T                W
2210         }
2211     } elsif ($string =~ /C/) {
2212         $char = 'C';                     # C                      C
2213         if ($string =~ /T/) {
2214             $char = 'Y';                 # C and T (pyrimidines)  Y
2215             if ($string =~ /G/) {
2216                 $char = 'B';             # C and T and G          B
2217             }
2218         } elsif ($string =~ /G/) {
2219             $char = 'S';                 # C and G                S
2220         }
2221     } elsif ($string =~ /G/) {
2222         $char = 'G';                     # G                      G
2223         if ($string =~ /C/) {
2224             $char = 'S';                 # G and C                S
2225         } elsif ($string =~ /T/) {
2226             $char = 'K';                 # G and T                K
2227         }
2228     } elsif ($string =~ /T/) {
2229         $char = 'T';                     # T                      T
2230     }
2231
2232     $char = 'U' if $rna and $char eq 'T';
2233     $char = lc $char if $string =~ /\W/;
2234
2235     return $char;
2236 }
2237
2238
2239 =head2 consensus_meta
2240
2241  Title     : consensus_meta
2242  Usage     : $seqmeta = $ali->consensus_meta()
2243  Function  : Returns a Bio::Seq::Meta object containing the consensus
2244              strings derived from meta data analysis.
2245  Returns   : Bio::Seq::Meta
2246  Argument  : Bio::Seq::Meta
2247  Throws    : non-MetaI object
2248
2249 =cut
2250
2251 sub consensus_meta {
2252     my ($self, $meta) = @_;
2253     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2254         $self->throw('Not a Bio::Seq::MetaI object');
2255     }
2256     return $self->{'_aln_meta'} = $meta if $meta;
2257     return $self->{'_aln_meta'}
2258 }
2259
2260 =head2 is_flush
2261
2262  Title     : is_flush
2263  Usage     : if ( $ali->is_flush() )
2264  Function  : Tells you whether the alignment
2265            : is flush, i.e. all of the same length
2266  Returns   : 1 or 0
2267  Argument  :
2268
2269 =cut
2270
2271 sub is_flush {
2272     my ( $self, $report ) = @_;
2273     my $seq;
2274     my $length = (-1);
2275     my $temp;
2276
2277     foreach $seq ( $self->each_seq() ) {
2278         if ( $length == (-1) ) {
2279             $length = CORE::length( $seq->seq() );
2280             next;
2281         }
2282
2283         $temp = CORE::length( $seq->seq() );
2284         if ( $temp != $length ) {
2285             $self->warn(
2286                 "expecting $length not $temp from " . $seq->display_id )
2287                 if ($report);
2288             $self->debug(
2289                 "expecting $length not $temp from " . $seq->display_id );
2290             $self->debug( $seq->seq() . "\n" );
2291             return 0;
2292         }
2293     }
2294
2295     return 1;
2296 }
2297
2298
2299 =head2 length
2300
2301  Title     : length()
2302  Usage     : $len = $ali->length()
2303  Function  : Returns the maximum length of the alignment.
2304              To be sure the alignment is a block, use is_flush
2305  Returns   : Integer
2306  Argument  :
2307
2308 =cut
2309
2310 sub length_aln {
2311     my $self = shift;
2312     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2313     $self->length(@_);
2314 }
2315
2316 sub length {
2317     my $self = shift;
2318     my $seq;
2319     my $length = -1;
2320     my $temp;
2321
2322     foreach $seq ( $self->each_seq() ) {
2323         $temp = $seq->length();
2324         if( $temp > $length ) {
2325             $length = $temp;
2326         }
2327     }
2328
2329     return $length;
2330 }
2331
2332
2333 =head2 maxdisplayname_length
2334
2335  Title     : maxdisplayname_length
2336  Usage     : $ali->maxdisplayname_length()
2337  Function  : Gets the maximum length of the displayname in the
2338              alignment. Used in writing out various MSA formats.
2339  Returns   : integer
2340  Argument  :
2341
2342 =cut
2343
2344 sub maxname_length {
2345     my $self = shift;
2346     $self->deprecated("maxname_length - deprecated method.".
2347                       " Use maxdisplayname_length() instead.");
2348     $self->maxdisplayname_length();
2349 }
2350
2351 sub maxnse_length {
2352     my $self = shift;
2353     $self->deprecated("maxnse_length - deprecated method.".
2354                       " Use maxnse_length() instead.");
2355     $self->maxdisplayname_length();
2356 }
2357
2358 sub maxdisplayname_length {
2359     my $self    = shift;
2360     my $maxname = (-1);
2361     my ( $seq, $len );
2362
2363     foreach $seq ( $self->each_seq() ) {
2364         $len = CORE::length $self->displayname( $seq->get_nse() );
2365
2366         if ( $len > $maxname ) {
2367             $maxname = $len;
2368         }
2369     }
2370
2371     return $maxname;
2372 }
2373
2374 =head2 max_metaname_length
2375
2376  Title     : max_metaname_length
2377  Usage     : $ali->max_metaname_length()
2378  Function  : Gets the maximum length of the meta name tags in the
2379              alignment for the sequences and for the alignment.
2380              Used in writing out various MSA formats.
2381  Returns   : integer
2382  Argument  : None
2383
2384 =cut
2385
2386 sub max_metaname_length {
2387     my $self = shift;
2388     my $maxname = (-1);
2389     my ($seq,$len);
2390
2391     # check seq meta first
2392     for $seq ( $self->each_seq() ) {
2393         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2394         for my $mtag ($seq->meta_names) {
2395             $len = CORE::length $mtag;
2396             if( $len > $maxname ) {
2397                 $maxname = $len;
2398             }
2399         }
2400     }
2401
2402     # alignment meta
2403     for my $meta ($self->consensus_meta) {
2404         next unless $meta;
2405         for my $name ($meta->meta_names) {
2406             $len = CORE::length $name;
2407             if( $len > $maxname ) {
2408                 $maxname = $len;
2409             }
2410         }
2411     }
2412
2413     return $maxname;
2414 }
2415
2416 =head2 num_residues
2417
2418  Title     : num_residues
2419  Usage     : $no = $ali->num_residues
2420  Function  : number of residues in total in the alignment
2421  Returns   : integer
2422  Argument  :
2423  Note      : replaces no_residues()
2424
2425 =cut
2426
2427 sub num_residues {
2428     my $self  = shift;
2429     my $count = 0;
2430
2431     foreach my $seq ( $self->each_seq ) {
2432         my $str = $seq->seq();
2433
2434         $count += ( $str =~ s/[A-Za-z]//g );
2435     }
2436
2437     return $count;
2438 }
2439
2440 =head2 num_sequences
2441
2442  Title     : num_sequences
2443  Usage     : $depth = $ali->num_sequences
2444  Function  : number of sequence in the sequence alignment
2445  Returns   : integer
2446  Argument  : none
2447  Note      : replaces no_sequences()
2448
2449 =cut
2450
2451 sub num_sequences {
2452     my $self = shift;
2453     return scalar($self->each_seq);
2454 }
2455
2456 =head2 average_percentage_identity
2457
2458  Title   : average_percentage_identity
2459  Usage   : $id = $align->average_percentage_identity
2460  Function: The function uses a fast method to calculate the average
2461            percentage identity of the alignment
2462  Returns : The average percentage identity of the alignment
2463  Args    : None
2464  Notes   : This method implemented by Kevin Howe calculates a figure that is
2465            designed to be similar to the average pairwise identity of the
2466            alignment (identical in the absence of gaps), without having to
2467            explicitly calculate pairwise identities proposed by Richard Durbin.
2468            Validated by Ewan Birney ad Alex Bateman.
2469
2470 =cut
2471
2472 sub average_percentage_identity{
2473    my ($self,@args) = @_;
2474
2475    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2476                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2477
2478    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2479
2480    if (! $self->is_flush()) {
2481        $self->throw("All sequences in the alignment must be the same length");
2482    }
2483
2484    @seqs = $self->each_seq();
2485    $len = $self->length();
2486
2487    # load the each hash with correct keys for existence checks
2488
2489    for( my $index=0; $index < $len; $index++) {
2490        foreach my $letter (@alphabet) {
2491            $countHashes[$index]->{$letter} = 0;
2492        }
2493    }
2494    foreach my $seq (@seqs)  {
2495        my @seqChars = split //, $seq->seq();
2496        for( my $column=0; $column < @seqChars; $column++ ) {
2497            my $char = uc($seqChars[$column]);
2498            if (exists $countHashes[$column]->{$char}) {
2499                $countHashes[$column]->{$char}++;
2500            }
2501        }
2502    }
2503
2504    $total = 0;
2505    $divisor = 0;
2506    for(my $column =0; $column < $len; $column++) {
2507        my %hash = %{$countHashes[$column]};
2508        $subdivisor = 0;
2509        foreach my $res (keys %hash) {
2510            $total += $hash{$res}*($hash{$res} - 1);
2511            $subdivisor += $hash{$res};
2512        }
2513        $divisor += $subdivisor * ($subdivisor - 1);
2514    }
2515    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2516 }
2517
2518 =head2 percentage_identity
2519
2520  Title   : percentage_identity
2521  Usage   : $id = $align->percentage_identity
2522  Function: The function calculates the average percentage identity
2523            (aliased to average_percentage_identity)
2524  Returns : The average percentage identity
2525  Args    : None
2526
2527 =cut
2528
2529 sub percentage_identity {
2530     my $self = shift;
2531     return $self->average_percentage_identity();
2532 }
2533
2534 =head2 overall_percentage_identity
2535
2536  Title   : overall_percentage_identity
2537  Usage   : $id = $align->overall_percentage_identity
2538            $id = $align->overall_percentage_identity('short')
2539  Function: The function calculates the percentage identity of
2540            the conserved columns
2541  Returns : The percentage identity of the conserved columns
2542  Args    : length value to use, optional defaults to alignment length
2543                  possible values: 'align', 'short', 'long'
2544
2545 The argument values 'short' and 'long' refer to shortest and longest
2546 sequence in the alignment. Method modification code by Hongyu Zhang.
2547
2548 =cut
2549
2550 sub overall_percentage_identity{
2551    my ($self, $length_measure) = @_;
2552
2553    my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2554
2555    my %enum = map {$_ => undef} qw (align short long);
2556
2557    $self->throw("Unknown argument [$length_measure]")
2558        if $length_measure and not exists $enum{$length_measure};
2559    $length_measure ||= 'align';
2560
2561    if (! $self->is_flush()) {
2562        $self->throw("All sequences in the alignment must be the same length");
2563    }
2564
2565    # Count the residues seen at each position
2566    my $len;
2567    my $total = 0; # number of positions with identical residues
2568    my @countHashes;
2569    my @seqs = $self->each_seq;
2570    my $nof_seqs = scalar @seqs;
2571    my $aln_len = $self->length();
2572    for my $seq (@seqs)  {
2573        my $seqstr = $seq->seq;
2574
2575        # Count residues for given sequence
2576        for my $column (0 .. $aln_len-1) {
2577            my $char = uc( substr($seqstr, $column, 1) );
2578            if ( exists $alphabet{$char} ) {
2579
2580                # This is a valid char
2581                if ( defined $countHashes[$column]->{$char} ) {
2582                  $countHashes[$column]->{$char}++;
2583                } else {
2584                  $countHashes[$column]->{$char} = 1;
2585                }
2586
2587                if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2588                    # All sequences have this same residue
2589                    $total++;
2590                }
2591
2592            }
2593        }
2594
2595        # Sequence length
2596        if ($length_measure eq 'short' || $length_measure eq 'long') {
2597            my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2598            if ($length_measure eq 'short') {
2599                if ( (not defined $len) || ($seq_len < $len) ) {
2600                    $len = $seq_len;
2601                }
2602            } elsif ($length_measure eq 'long') {
2603                if ( (not defined $len) || ($seq_len > $len) ) {
2604                    $len = $seq_len;
2605                }
2606            }
2607        }
2608
2609    }
2610
2611    if ($length_measure eq 'align') {
2612        $len = $aln_len;
2613    }
2614
2615    return ($total / $len ) * 100.0;
2616 }
2617
2618
2619
2620 =head1 Alignment positions
2621
2622 Methods to map a sequence position into an alignment column and back.
2623 column_from_residue_number() does the former. The latter is really a
2624 property of the sequence object and can done using
2625 L<Bio::LocatableSeq::location_from_column>:
2626
2627     # select somehow a sequence from the alignment, e.g.
2628     my $seq = $aln->get_seq_by_pos(1);
2629     #$loc is undef or Bio::LocationI object
2630     my $loc = $seq->location_from_column(5);
2631
2632 =head2 column_from_residue_number
2633
2634  Title   : column_from_residue_number
2635  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2636  Function: This function gives the position in the alignment
2637            (i.e. column number) of the given residue number in the
2638            sequence with the given name. For example, for the
2639            alignment
2640
2641              Seq1/91-97 AC..DEF.GH.
2642              Seq2/24-30 ACGG.RTY...
2643                 Seq3/43-51 AC.DDEF.GHI
2644
2645            column_from_residue_number( "Seq1", 94 ) returns 6.
2646            column_from_residue_number( "Seq2", 25 ) returns 2.
2647            column_from_residue_number( "Seq3", 50 ) returns 10.
2648
2649            An exception is thrown if the residue number would lie
2650            outside the length of the aligment
2651            (e.g. column_from_residue_number( "Seq2", 22 )
2652
2653           Note: If the the parent sequence is represented by more than
2654                 one alignment sequence and the residue number is present in
2655                 them, this method finds only the first one.
2656
2657  Returns : A column number for the position in the alignment of the
2658            given residue in the given sequence (1 = first column)
2659  Args    : A sequence id/name (not a name/start-end)
2660            A residue number in the whole sequence (not just that
2661            segment of it in the alignment)
2662
2663 =cut
2664
2665 sub column_from_residue_number {
2666     my ( $self, $name, $resnumber ) = @_;
2667
2668     $self->throw("No sequence with name [$name]")
2669         unless $self->{'_start_end_lists'}->{$name};
2670     $self->throw("Second argument residue number missing") unless $resnumber;
2671
2672     foreach my $seq ( $self->each_seq_with_id($name) ) {
2673         my $col;
2674         eval { $col = $seq->column_from_residue_number($resnumber); };
2675         next if $@;
2676         return $col;
2677     }
2678
2679     $self->throw( "Could not find a sequence segment in $name "
2680             . "containing residue number $resnumber" );
2681
2682 }
2683
2684 =head1 Sequence names
2685
2686 Methods to manipulate the display name. The default name based on the
2687 sequence id and subsequence positions can be overridden in various
2688 ways.
2689
2690 =head2 displayname
2691
2692  Title     : displayname
2693  Usage     : $myalign->displayname("Ig", "IgA")
2694  Function  : Gets/sets the display name of a sequence in the alignment
2695  Returns   : A display name string
2696  Argument  : name of the sequence
2697              displayname of the sequence (optional)
2698
2699 =cut
2700
2701 sub displayname {
2702     my ( $self, $name, $disname ) = @_;
2703
2704     $self->throw("No sequence with name [$name]")
2705         unless defined $self->{'_seq'}->{$name};
2706
2707     if ( $disname and $name ) {
2708         $self->{'_dis_name'}->{$name} = $disname;
2709         return $disname;
2710     }
2711     elsif ( defined $self->{'_dis_name'}->{$name} ) {
2712         return $self->{'_dis_name'}->{$name};
2713     }
2714     else {
2715         return $name;
2716     }
2717 }
2718
2719 sub get_displayname {
2720     my $self = shift;
2721     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2722     $self->displayname(@_);
2723 }
2724
2725 sub set_displayname {
2726     my $self = shift;
2727     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2728     $self->displayname(@_);
2729 }
2730
2731
2732 =head2 set_displayname_count
2733
2734  Title     : set_displayname_count
2735  Usage     : $ali->set_displayname_count
2736  Function  : Sets the names to be name_# where # is the number of
2737              times this name has been used.
2738  Returns   : 1, on success
2739  Argument  :
2740
2741 =cut
2742
2743 sub set_displayname_count {
2744     my $self= shift;
2745     my (@arr,$name,$seq,$count,$temp,$nse);
2746
2747     foreach $seq ( $self->each_alphabetically() ) {
2748         $nse = $seq->get_nse();
2749
2750         #name will be set when this is the second
2751         #time (or greater) is has been seen
2752
2753         if( defined $name and $name eq ($seq->id()) ) {
2754             $temp = sprintf("%s_%s",$name,$count);
2755             $self->displayname($nse,$temp);
2756             $count++;
2757         } else {
2758             $count = 1;
2759             $name = $seq->id();
2760             $temp = sprintf("%s_%s",$name,$count);
2761             $self->displayname($nse,$temp);
2762             $count++;
2763         }
2764     }
2765     return 1;
2766 }
2767
2768 =head2 set_displayname_flat
2769
2770  Title     : set_displayname_flat
2771  Usage     : $ali->set_displayname_flat()
2772  Function  : Makes all the sequences be displayed as just their name,
2773              not name/start-end (NSE)
2774  Returns   : 1
2775  Argument  :
2776
2777 =cut
2778
2779 sub set_displayname_flat {
2780     my $self = shift;
2781     my ( $nse, $seq );
2782
2783     foreach $seq ( $self->each_seq() ) {
2784         $nse = $seq->get_nse();
2785         $self->displayname( $nse, $seq->id() );
2786     }
2787     return 1;
2788 }
2789
2790
2791 =head2 set_displayname_normal
2792
2793  Title     : set_displayname_normal
2794  Usage     : $ali->set_displayname_normal()
2795  Function  : Makes all the sequences be displayed as name/start-end (NSE)
2796  Returns   : 1, on success
2797  Argument  :
2798
2799 =cut
2800
2801 sub set_displayname_normal {
2802     my $self = shift;
2803     my ( $nse, $seq );
2804
2805     foreach $seq ( $self->each_seq() ) {
2806         $nse = $seq->get_nse();
2807         $self->displayname( $nse, $nse );
2808     }
2809     return 1;
2810 }
2811
2812 =head2 source
2813
2814  Title   : source
2815  Usage   : $obj->source($newval)
2816  Function: sets the Alignment source program
2817  Example :
2818  Returns : value of source
2819  Args    : newvalue (optional)
2820
2821 =cut
2822
2823 sub source {
2824     my ( $self, $value ) = @_;
2825     if ( defined $value ) {
2826         $self->{'_source'} = $value;
2827     }
2828     return $self->{'_source'};
2829 }
2830
2831
2832 =head2 set_displayname_safe
2833
2834  Title     : set_displayname_safe
2835  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2836  Function  : Assign machine-generated serial names to sequences in input order.
2837              Designed to protect names during PHYLIP runs. Assign 10-char string
2838              in the form of "S000000001" to "S999999999". Restore the original
2839              names using "restore_displayname".
2840  Returns   : 1. a new $aln with system names;
2841              2. a hash ref for restoring names
2842  Argument  : Number for id length (default 10)
2843
2844 =cut
2845
2846 sub set_displayname_safe {
2847     my $self = shift;
2848     my $idlength = shift || 10;
2849     my ( $seq, %phylip_name );
2850     my $ct  = 0;
2851     my $new = Bio::SimpleAlign->new();
2852     foreach $seq ( $self->each_seq() ) {
2853         $ct++;
2854         my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2855         $phylip_name{$pname} = $seq->id();
2856         my $new_seq = Bio::LocatableSeq->new(
2857             -id       => $pname,
2858             -seq      => $seq->seq(),
2859             -alphabet => $seq->alphabet,
2860             -start    => $seq->{_start},
2861             -end      => $seq->{_end}
2862         );
2863         $new->add_seq($new_seq);
2864     }
2865
2866     $self->debug(
2867         "$ct seq names changed. Restore names by using restore_displayname.");
2868     return ( $new, \%phylip_name );
2869 }
2870
2871
2872 =head2 restore_displayname
2873
2874  Title     : restore_displayname
2875  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2876  Function  : Restore original sequence names (after running
2877              $ali->set_displayname_safe)
2878  Returns   : a new $aln with names restored.
2879  Argument  : a hash reference of names from "set_displayname_safe".
2880
2881 =cut
2882
2883 sub restore_displayname {
2884     my $self = shift;
2885     my $ref=shift;
2886     my %name=%$ref;
2887     my $new=Bio::SimpleAlign->new();
2888     foreach my $seq ( $self->each_seq() ) {
2889       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2890       my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2891                                           -seq      => $seq->seq(),
2892                                           -alphabet => $seq->alphabet,
2893                                           -start    => $seq->{_start},
2894                                           -end      => $seq->{_end}
2895                                           );
2896       $new->add_seq($new_seq);
2897     }
2898     return $new;
2899 }
2900
2901 =head2 sort_by_start
2902
2903  Title     : sort_by_start
2904  Usage     : $ali->sort_by_start
2905  Function  : Changes the order of the alignment to the start position of each
2906              subalignment
2907  Returns   : 1 on success
2908  Argument  :
2909
2910 =cut
2911
2912 sub sort_by_start {
2913     my $self = shift;
2914     my ($seq,$nse,@arr,%hash,$count);
2915     foreach $seq ( $self->each_seq() ) {
2916         $nse = $seq->get_nse;
2917         $hash{$nse} = $seq;
2918     }
2919     $count = 0;
2920     %{$self->{'_order'}} = (); # reset the hash;
2921     foreach $nse ( sort _startend keys %hash) {
2922         $self->{'_order'}->{$count} = $nse;
2923         $count++;
2924     }
2925     1;
2926 }
2927
2928 sub _startend {
2929     my ($aname,$arange) = split (/[\/]/,$a);
2930     my ($bname,$brange) = split (/[\/]/,$b);
2931     my ($astart,$aend) = split(/\-/,$arange);
2932     my ($bstart,$bend) = split(/\-/,$brange);
2933     return $astart <=> $bstart;
2934 }
2935
2936 =head2 bracket_string
2937
2938  Title     : bracket_string
2939  Usage     : my @params = (-refseq     => 'testseq',
2940                            -allele1    => 'allele1',
2941                            -allele2    => 'allele2',
2942                            -delimiters => '{}',
2943                            -separator  => '/');
2944              $str = $aln->bracket_string(@params)
2945
2946  Function :  When supplied with a list of parameters (see below), returns a
2947              string in BIC format. This is used for allelic comparisons.
2948              Briefly, if either allele contains a base change when compared to
2949              the refseq, the base or gap for each allele is represented in
2950              brackets in the order present in the 'alleles' parameter.
2951
2952              For the following data:
2953
2954              >testseq
2955              GGATCCATTGCTACT
2956              >allele1
2957              GGATCCATTCCTACT
2958              >allele2
2959              GGAT--ATTCCTCCT
2960
2961              the returned string with parameters 'refseq => testseq' and
2962              'alleles => [qw(allele1 allele2)]' would be:
2963
2964              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2965  Returns   : BIC-formatted string
2966  Argument  : Required args
2967                 refseq    : string (ID) of the reference sequence used
2968                             as basis for comparison
2969                 allele1   : string (ID) of the first allele
2970                 allele2   : string (ID) of the second allele
2971              Optional args
2972                 delimiters: two symbol string of left and right delimiters.
2973                             Only the first two symbols are used
2974                             default = '[]'
2975                 separator : string used as a separator.  Only the first
2976                             symbol is used
2977                             default = '/'
2978  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2979
2980 =cut
2981
2982 sub bracket_string {
2983     my ($self, @args) = @_;
2984     my ($ref, $a1, $a2, $delim, $sep) =
2985         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2986     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2987     my ($ld, $rd);
2988     ($ld, $rd) = split('', $delim, 2) if $delim;
2989     $ld ||= '[';
2990     $rd ||= ']';
2991     $sep ||= '/';
2992     my ($refseq, $allele1, $allele2) =
2993         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2994     if (!$refseq || !$allele1 || !$allele2) {
2995         $self->throw("One of your refseq/allele IDs is invalid!");
2996     }
2997     my $len = $self->length-1;
2998     my $bic = '';
2999     # loop over the alignment columns
3000     for my $column ( 0 .. $len ) {
3001         my $string;
3002         my ($compres, $res1, $res2) =
3003             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3004         # are any of the allele symbols different from the refseq?
3005         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3006                 $ld.$res1.$sep.$res2.$rd;
3007         $bic .= $string;
3008     }
3009     return $bic;
3010 }
3011
3012
3013 =head2 methods implementing Bio::FeatureHolderI
3014
3015 FeatureHolderI implementation to support labeled character sets like one
3016 would get from NEXUS represented data.
3017
3018 =head2 get_SeqFeatures
3019
3020  Usage   : @features = $aln->get_SeqFeatures
3021  Function: Get the feature objects held by this feature holder.
3022  Example :
3023  Returns : an array of Bio::SeqFeatureI implementing objects
3024  Args    : optional filter coderef, taking a Bio::SeqFeatureI
3025          : as argument, returning TRUE if wanted, FALSE if
3026          : unwanted
3027
3028 =cut
3029
3030 sub get_SeqFeatures {
3031     my $self      = shift;
3032     my $filter_cb = shift;
3033     $self->throw("Arg (filter callback) must be a coderef")
3034         unless !defined($filter_cb)
3035         or ref($filter_cb) eq 'CODE';
3036     if ( !defined $self->{'_as_feat'} ) {
3037         $self->{'_as_feat'} = [];
3038     }
3039     if ($filter_cb) {
3040         return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
3041     }
3042     return @{ $self->{'_as_feat'} };
3043 }
3044
3045
3046 =head2 add_SeqFeature
3047
3048  Usage   : $aln->add_SeqFeature($subfeat);
3049  Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3050            (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3051  Example :
3052  Returns : 1 on success
3053  Args    : a Bio::SeqFeatureI object
3054
3055 =cut
3056
3057 sub add_SeqFeature {
3058    my ($self, @feat) = @_;
3059
3060    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3061
3062    if (scalar @feat > 1) {
3063       $self->deprecated(
3064          -message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3065                      ' is deprecated and will be removed in a future version. '.
3066                      'Add a single feature at a time instead.',
3067          -warn_version    => 1.007,
3068          -throw_version   => 1.009,
3069       );
3070    }
3071
3072    for my $feat ( @feat ) {
3073
3074        next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3075
3076        if( !$feat->isa("Bio::SeqFeatureI") ) {
3077            $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3078        }
3079
3080        push @{$self->{'_as_feat'}}, $feat;
3081    }
3082    return 1;
3083 }
3084
3085
3086 =head2 remove_SeqFeatures
3087
3088  Usage   : $obj->remove_SeqFeatures
3089  Function: Removes all SeqFeatures.  If you want to remove only a subset,
3090            remove that subset from the returned array, and add back the rest.
3091  Returns : The array of Bio::SeqFeatureI features that was
3092            deleted from this alignment.
3093  Args    : none
3094
3095 =cut
3096
3097 sub remove_SeqFeatures {
3098     my $self = shift;
3099
3100     return () unless $self->{'_as_feat'};
3101     my @feats = @{$self->{'_as_feat'}};
3102     $self->{'_as_feat'} = [];
3103     return @feats;
3104 }
3105
3106 =head2 feature_count
3107
3108  Title   : feature_count
3109  Usage   : $obj->feature_count()
3110  Function: Return the number of SeqFeatures attached to the alignment
3111  Returns : integer representing the number of SeqFeatures
3112  Args    : None
3113
3114 =cut
3115
3116 sub feature_count {
3117     my ($self) = @_;
3118
3119     if (defined($self->{'_as_feat'})) {
3120         return ($#{$self->{'_as_feat'}} + 1);
3121     } else {
3122         return 0;
3123     }
3124 }
3125
3126 =head2 get_all_SeqFeatures
3127
3128  Title   : get_all_SeqFeatures
3129  Usage   :
3130  Function: Get all SeqFeatures.
3131  Example :
3132  Returns : an array of Bio::SeqFeatureI implementing objects
3133  Args    : none
3134  Note    : Falls through to Bio::FeatureHolderI implementation.
3135
3136 =cut
3137
3138 =head2 methods for Bio::AnnotatableI
3139
3140 AnnotatableI implementation to support sequence alignments which
3141 contain annotation (NEXUS, Stockholm).
3142
3143 =head2 annotation
3144
3145  Title   : annotation
3146  Usage   : $ann = $aln->annotation or
3147            $aln->annotation($ann)
3148  Function: Gets or sets the annotation
3149  Returns : Bio::AnnotationCollectionI object
3150  Args    : None or Bio::AnnotationCollectionI object
3151
3152 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3153 for more information
3154
3155 =cut
3156
3157 sub annotation {
3158     my ($obj,$value) = @_;
3159     if( defined $value ) {
3160         $obj->throw("object of class ".ref($value)." does not implement ".
3161                 "Bio::AnnotationCollectionI. Too bad.")
3162             unless $value->isa("Bio::AnnotationCollectionI");
3163         $obj->{'_annotation'} = $value;
3164     } elsif( ! defined $obj->{'_annotation'}) {
3165         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3166     }
3167     return $obj->{'_annotation'};
3168 }
3169
3170 =head1 Deprecated methods
3171
3172 =cut
3173
3174 =head2 no_residues
3175
3176  Title     : no_residues
3177  Usage     : $no = $ali->no_residues
3178  Function  : number of residues in total in the alignment
3179  Returns   : integer
3180  Argument  :
3181  Note      : deprecated in favor of num_residues()
3182
3183 =cut
3184
3185 sub no_residues {
3186         my $self = shift;
3187         $self->deprecated(-warn_version => 1.0069,
3188                                           -throw_version => 1.0075,
3189                       -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3190   $self->num_residues(@_);
3191 }
3192
3193 =head2 no_sequences
3194
3195  Title     : no_sequences
3196  Usage     : $depth = $ali->no_sequences
3197  Function  : number of sequence in the sequence alignment
3198  Returns   : integer
3199  Argument  :
3200  Note      : deprecated in favor of num_sequences()
3201
3202 =cut
3203
3204 sub no_sequences {
3205         my $self = shift;
3206         $self->deprecated(-warn_version => 1.0069,
3207                                           -throw_version => 1.0075,
3208                       -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3209     $self->num_sequences(@_);
3210 }
3211
3212 =head2 mask_columns
3213
3214  Title     : mask_columns
3215  Usage     : $aln2 = $aln->mask_columns(20,30)
3216  Function  : Masks a slice of the alignment inclusive of start and
3217              end columns, and the first column in the alignment is denoted 1.
3218              Mask beyond the length of the sequence does not do padding.
3219  Returns   : A Bio::SimpleAlign object
3220  Args      : Positive integer for start column, positive integer for end column,
3221              optional string value use for the mask. Example:
3222
3223              $aln2 = $aln->mask_columns(20,30,'?')
3224  Note      : Masking must use a character that is not used for gaps or
3225              frameshifts.  These can be adjusted using the relevant global
3226              variables, but be aware these may be (uncontrollably) modified
3227              elsewhere within BioPerl (see bug 2715)
3228
3229 =cut
3230
3231 sub mask_columns {
3232     #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3233     my $self = shift;
3234
3235     my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3236              $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3237
3238     # coordinates are alignment-based, not sequence-based
3239     my ($start, $end, $mask_char) = @_;
3240     unless (defined $mask_char) { $mask_char = 'N' }
3241
3242     $self->throw("Mask start has to be a positive integer and less than ".
3243                  "alignment length, not [$start]")
3244       unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3245     $self->throw("Mask end has to be a positive integer and less than ".
3246                  "alignment length, not [$end]")
3247       unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3248     $self->throw("Mask start [$start] has to be smaller than or equal to ".
3249                  "end [$end]") unless $start <= $end;
3250     $self->throw("Mask character $mask_char has to be a single character ".
3251                  "and not a gap or frameshift symbol")
3252       unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3253
3254     my $aln = $self->new;
3255     $aln->id($self->id);
3256     foreach my $seq ( $self->each_seq() ) {
3257         my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3258          -alphabet => $seq->alphabet,
3259          -strand  => $seq->strand,
3260          -verbose => $self->verbose);
3261
3262         # convert from 1-based alignment coords!
3263         my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3264         $masked_string =~ s{[^$nonres]}{$mask_char}g;
3265         my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3266         $new_seq->seq($new_dna_string);
3267         $aln->add_seq($new_seq);
3268     }
3269     # Preserve chosen mask character, it may be need later (like in 'slice')
3270     $aln->{_mask_char} = $mask_char;
3271     return $aln;
3272 }
3273
3274 1;