Bio/SimpleAlign.pm

   1 # $Id$
   2 # BioPerl module for SimpleAlign
   3 #
   4 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   5 #
   6 # Copyright Ewan Birney
   7 #
   8 # You may distribute this module under the same terms as perl itself
   9
  10 # POD documentation - main docs before the code
  11 #
  12 #  History:
  13 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  14 #       May 2001 major rewrite - Heikki Lehvaslaiho
  15
  16 =head1 NAME
  17
  18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  19
  20 =head1 SYNOPSIS
  21
  22   # Use Bio::AlignIO to read in the alignment
  23   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  24   $aln = $str->next_aln();
  25
  26   # Describe
  27   print $aln->length;
  28   print $aln->no_residues;
  29   print $aln->is_flush;
  30   print $aln->no_sequences;
  31   print $aln->score;
  32   print $aln->percentage_identity;
  33   print $aln->consensus_string(50);
  34
  35   # Find the position in the alignment for a sequence location
  36   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  37
  38   # Extract sequences and check values for the alignment column $pos
  39   foreach $seq ($aln->each_seq) {
  40       $res = $seq->subseq($pos, $pos);
  41       $count{$res}++;
  42   }
  43   foreach $res (keys %count) {
  44       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  45   }
  46
  47   # Manipulate
  48   $aln->remove_seq($seq);
  49   $mini_aln = $aln->slice(20,30);  # get a block of columns
  50   $mini_aln = $aln->select_noncont(1,3,5,7,11); # get single columns
  51   $new_aln = $aln->remove_columns([20,30]); # remove by position
  52   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  53
  54   # Analyze
  55   $str = $aln->consensus_string($threshold_percent);
  56   $str = $aln->match_line();
  57   $str = $aln->cigar_line();
  58   $id = $aln->percentage_identity;
  59
  60   # See the module documentation for details and more methods.
  61
  62 =head1 DESCRIPTION
  63
  64 SimpleAlign is an object that handles a multiple sequence alignment
  65 (MSA). It is very permissive of types (it does not insist on sequences
  66 being all same length, for example). Think of it as a set of sequences
  67 with a whole series of built-in manipulations and methods for reading and
  68 writing alignments.
  69
  70 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  71 to store its sequences. These are subsequences with a start and end
  72 positions in the parent reference sequence. Each sequence in the
  73 SimpleAlign object is a Bio::LocatableSeq.
  74
  75 SimpleAlign expects the combination of name, start, and end for a
  76 given sequence to be unique in the alignment, and this is the key for the
  77 internal hashes (name, start, end are abbreviated C<nse> in the code).
  78 However, in some cases people do not want the name/start-end to be displayed:
  79 either multiple names in an alignment or names specific to the alignment
  80 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  81 C<displayname>, and generally is what is used to print out the
  82 alignment. They default to name/start-end.
  83
  84 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  85
  86 =head1 FEEDBACK
  87
  88 =head2 Mailing Lists
  89
  90 User feedback is an integral part of the evolution of this and other
  91 Bioperl modules.  Send your comments and suggestions preferably to one
  92 of the Bioperl mailing lists.  Your participation is much appreciated.
  93
  94   bioperl-l@bioperl.org                  - General discussion
  95   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 the bugs and their resolution. Bug reports can be submitted via the
 101 web:
 102
 103   http://bugzilla.open-bio.org/
 104
 105 =head1 AUTHOR
 106
 107 Ewan Birney, birney@ebi.ac.uk
 108
 109 =head1 CONTRIBUTORS
 110
 111 Allen Day, allenday-at-ucla.edu,
 112 Richard Adams, Richard.Adams-at-ed.ac.uk,
 113 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 114 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 115 Allen Smith, allens-at-cpan.org,
 116 Jason Stajich, jason-at-bioperl.org,
 117 Anthony Underwood, aunderwood-at-phls.org.uk,
 118 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 119 Brian Osborne, bosborne at alum.mit.edu
 120 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 121 Hongyu Zhang, forward at hongyu.org
 122
 123 =head1 SEE ALSO
 124
 125 L<Bio::LocatableSeq>
 126
 127 =head1 APPENDIX
 128
 129 The rest of the documentation details each of the object
 130 methods. Internal methods are usually preceded with a _
 131
 132 =cut
 133
 134 # 'Let the code begin...
 135
 136 package Bio::SimpleAlign;
 137 use vars qw(%CONSERVATION_GROUPS);
 138 use strict;
 139
 140 use Bio::LocatableSeq;  # uses Seq's as list
 141
 142 use Bio::Seq;
 143 use Bio::SeqFeature::Generic;
 144
 145 BEGIN {
 146     # This data should probably be in a more centralized module...
 147     # it is taken from Clustalw documentation.
 148     # These are all the positively scoring groups that occur in the
 149     # Gonnet Pam250 matrix. The strong and weak groups are
 150     # defined as strong score >0.5 and weak score =<0.5 respectively.
 151
 152     %CONSERVATION_GROUPS = (
 153             'strong' => [ qw(
 154                                                  STA
 155                                                  NEQK
 156                                                  NHQK
 157                                                  NDEQ
 158                                                  QHRK
 159                                                  MILV
 160                                                  MILF
 161                                                  HY
 162                                                  FYW )],
 163                                 'weak' => [ qw(
 164                       CSA
 165                                                ATV
 166                                                SAG
 167                                                STNK
 168                                                STPA
 169                                                SGND
 170                                                SNDEQK
 171                                                NDEQHK
 172                                                NEQHRK
 173                                                FVLIM
 174                                                HFY )],);
 175 }
 176
 177 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 178             Bio::FeatureHolderI);
 179
 180 =head2 new
 181
 182  Title     : new
 183  Usage     : my $aln = Bio::SimpleAlign->new();
 184  Function  : Creates a new simple align object
 185  Returns   : Bio::SimpleAlign
 186  Args      : -source     => string representing the source program
 187                             where this alignment came from
 188              -annotation => Bio::AnnotationCollectionI
 189              -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
 190              -seqs       => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
 191              -consensus  => consensus string
 192              -consensus_meta  => Bio::Seq::Meta object containing consensus met information (kludge)
 193
 194 =cut
 195
 196
 197 sub new {
 198   my($class,@args) = @_;
 199
 200   my $self = $class->SUPER::new(@args);
 201
 202   my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
 203                                             SOURCE
 204                                             SCORE
 205                                             ID
 206                                             ACCESSION
 207                                             DESCRIPTION
 208                                             SEQS
 209                                             FEATURES
 210                                             ANNOTATION
 211                                             SEQ_ANNOTATION
 212                                             CONSENSUS
 213                                             CONSENSUS_META
 214                                             )], @args);
 215   $src && $self->source($src);
 216   defined $score && $self->score($score);
 217   # we need to set up internal hashs first!
 218
 219   $self->{'_seq'} = {};
 220   $self->{'_order'} = {};
 221   $self->{'_start_end_lists'} = {};
 222   $self->{'_dis_name'} = {};
 223   $self->{'_id'} = 'NoName';
 224   # maybe we should automatically read in from args. Hmmm...
 225   $id  && $self->id($id);
 226   $acc && $self->accession($acc);
 227   $desc && $self->description($desc);
 228   $coll && $self->annotation($coll);
 229   # sequence annotation is layered into a provided annotation collection (or dies)
 230   if ($sa) {
 231     $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
 232                  "with a sequence annotation collection")
 233         if !$coll;
 234     $coll->add_Annotation('seq_annotation', $sa);
 235   }
 236   if ($feats && ref $feats eq 'ARRAY') {
 237     for my $feat (@$feats) {
 238         $self->add_SeqFeature($feat);
 239     }
 240   }
 241   $con && $self->consensus($con);
 242   $cmeta && $self->consensus_meta($cmeta);
 243   # assumes these are in correct alignment order
 244   if ($seqs && ref($seqs) eq 'ARRAY') {
 245     for my $seq (@$seqs) {
 246         $self->add_seq($seq);
 247     }
 248   }
 249
 250   return $self; # success - we hope!
 251 }
 252
 253 =head1 Modifier methods
 254
 255 These methods modify the MSA by adding, removing or shuffling complete
 256 sequences.
 257
 258 =head2 add_seq
 259
 260  Title     : add_seq
 261  Usage     : $myalign->add_seq($newseq);
 262  Function  : Adds another sequence to the alignment. *Does not* align
 263              it - just adds it to the hashes.
 264  Returns   : nothing
 265  Args      : a Bio::LocatableSeq object
 266              order (optional)
 267
 268 See L<Bio::LocatableSeq> for more information
 269
 270 =cut
 271
 272 sub addSeq {
 273     my $self = shift;
 274     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 275     $self->add_seq(@_);
 276 }
 277
 278 sub add_seq {
 279     my $self = shift;
 280     my $seq  = shift;
 281     my $order = shift;
 282     my ($name,$id,$start,$end);
 283
 284     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 285         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 286     }
 287
 288     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 289
 290     # build the symbol list for this sequence,
 291     # will prune out the gap and missing/match chars
 292     # when actually asked for the symbol list in the
 293     # symbol_chars
 294     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 295
 296     if( !defined $order ) {
 297         $order = keys %{$self->{'_seq'}};
 298     }
 299     $name = $seq->get_nse;
 300
 301     if( $self->{'_seq'}->{$name} ) {
 302         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 303     }
 304     else {
 305         $self->debug( "Assigning $name to $order\n");
 306
 307         $self->{'_order'}->{$order} = $name;
 308
 309         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 310             $self->{'_start_end_lists'}->{$id} = [];
 311         }
 312         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 313     }
 314
 315     $self->{'_seq'}->{$name} = $seq;
 316
 317 }
 318
 319
 320 =head2 remove_seq
 321
 322  Title     : remove_seq
 323  Usage     : $aln->remove_seq($seq);
 324  Function  : Removes a single sequence from an alignment
 325  Returns   :
 326  Argument  : a Bio::LocatableSeq object
 327
 328 =cut
 329
 330 sub removeSeq {
 331     my $self = shift;
 332     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 333     $self->remove_seq(@_);
 334 }
 335
 336 sub remove_seq {
 337     my $self = shift;
 338     my $seq = shift;
 339     my ($name,$id,$start,$end);
 340
 341     $self->throw("Need Bio::Locatable seq argument ")
 342         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 343
 344     $id = $seq->id();
 345     $start = $seq->start();
 346     $end  = $seq->end();
 347     $name = sprintf("%s/%d-%d",$id,$start,$end);
 348
 349     if( !exists $self->{'_seq'}->{$name} ) {
 350         $self->throw("Sequence $name does not exist in the alignment to remove!");
 351     }
 352
 353     delete $self->{'_seq'}->{$name};
 354
 355     # we need to remove this seq from the start_end_lists hash
 356
 357     if (exists $self->{'_start_end_lists'}->{$id}) {
 358         # we need to find the sequence in the array.
 359
 360         my ($i, $found);;
 361         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 362             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 363                 $found = 1;
 364                 last;
 365             }
 366         }
 367         if ($found) {
 368             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 369         }
 370         else {
 371             $self->throw("Could not find the sequence to remoce from the start-end list");
 372         }
 373     }
 374     else {
 375         $self->throw("There is no seq list for the name $id");
 376     }
 377     # we need to shift order hash
 378     my %rev_order = reverse %{$self->{'_order'}};
 379     my $no = $rev_order{$name};
 380     my $no_sequences = $self->no_sequences;
 381     for (; $no < $no_sequences; $no++) {
 382        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 383     }
 384     delete $self->{'_order'}->{$no};
 385     return 1;
 386 }
 387
 388
 389 =head2 purge
 390
 391  Title   : purge
 392  Usage   : $aln->purge(0.7);
 393  Function: Removes sequences above given sequence similarity
 394            This function will grind on large alignments. Beware!
 395  Example :
 396  Returns : An array of the removed sequences
 397  Args    : float, threshold for similarity
 398
 399 =cut
 400
 401 sub purge {
 402         my ($self,$perc) = @_;
 403         my (%duplicate, @dups);
 404
 405         my @seqs = $self->each_seq();
 406
 407         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 408                 my $seq = $seqs[$i];
 409
 410                 #skip if already in duplicate hash
 411                 next if exists $duplicate{$seq->display_id} ;
 412                 my $one = $seq->seq();
 413
 414                 my @one = split '', $one;       #split to get 1aa per array element
 415
 416                 for (my $j=$i+1;$j < @seqs;$j++) {
 417                         my $seq2 = $seqs[$j];
 418
 419                         #skip if already in duplicate hash
 420                         next if exists $duplicate{$seq2->display_id} ;
 421
 422                         my $two = $seq2->seq();
 423                         my @two = split '', $two;
 424
 425                         my $count = 0;
 426                         my $res = 0;
 427                         for (my $k=0;$k<@one;$k++) {
 428                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 429                                           $one[$k] eq $two[$k]) {
 430                                         $count++;
 431                                 }
 432                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 433                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 434                                         $res++;
 435                                 }
 436                         }
 437
 438                         my $ratio = 0;
 439                         $ratio = $count/$res unless $res == 0;
 440
 441                         # if above threshold put in duplicate hash and push onto
 442                         # duplicate array for returning to get_unique
 443                         if ( $ratio > $perc ) {
 444                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 445                                 $duplicate{$seq2->display_id} = 1;
 446                                 push @dups, $seq2;
 447                         }
 448                 }
 449         }
 450         foreach my $seq (@dups) {
 451                 $self->remove_seq($seq);
 452         }
 453         return @dups;
 454 }
 455
 456 =head2 sort_alphabetically
 457
 458  Title     : sort_alphabetically
 459  Usage     : $ali->sort_alphabetically
 460  Function  : Changes the order of the alignment to alphabetical on name
 461              followed by numerical by number.
 462  Returns   :
 463  Argument  :
 464
 465 =cut
 466
 467 sub sort_alphabetically {
 468     my $self = shift;
 469     my ($seq,$nse,@arr,%hash,$count);
 470
 471     foreach $seq ( $self->each_seq() ) {
 472         $nse = $seq->get_nse;
 473         $hash{$nse} = $seq;
 474     }
 475
 476     $count = 0;
 477
 478     %{$self->{'_order'}} = (); # reset the hash;
 479
 480     foreach $nse ( sort _alpha_startend keys %hash) {
 481         $self->{'_order'}->{$count} = $nse;
 482
 483         $count++;
 484     }
 485     1;
 486 }
 487
 488 =head2 sort_by_list
 489
 490  Title     : sort_by_list
 491  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 492  Function  : Arbitrarily order sequences in an alignment
 493  Returns   : A new Bio::SimpleAlign object
 494  Argument  : a file listing sequence names in intended order (one name per line)
 495
 496 =cut
 497
 498 sub sort_by_list {
 499     my ($self, $list) = @_;
 500     my (@seq, @ids, %order);
 501
 502     foreach my $seq ( $self->each_seq() ) {
 503         push @seq, $seq;
 504         push @ids, $seq->display_id;
 505     }
 506
 507     my $ct=1;
 508     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 509     while (<$listfh>) {
 510       chomp;
 511       my $name=$_;
 512       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 513       $order{$name}=$ct++;
 514     }
 515     close($listfh);
 516
 517     # use the map-sort-map idiom:
 518     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 519     my $aln = $self->new;
 520     foreach (@sorted) { $aln->add_seq($_) }
 521     return $aln;
 522 }
 523
 524 =head2 set_new_reference
 525
 526  Title     : set_new_reference
 527  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 528              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 529              as the reference (1st) sequence
 530  Function  : Change/Set a new reference (i.e., the first) sequence
 531  Returns   : a new Bio::SimpleAlign object.
 532              Throws an exception if designated sequence not found
 533  Argument  : a positive integer of sequence order, or a sequence name
 534              in the original alignment
 535
 536 =cut
 537
 538 sub set_new_reference {
 539     my ($self, $seqid) = @_;
 540     my $aln = $self->new;
 541     my (@seq, @ids, @new_seq);
 542     my $is_num=0;
 543     foreach my $seq ( $self->each_seq() ) {
 544         push @seq, $seq;
 545         push @ids, $seq->display_id;
 546     }
 547
 548     if ($seqid =~ /^\d+$/) { # argument is seq position
 549         $is_num=1;
 550         $self->throw("The new reference sequence number has to be a positive integer >1 and <= no_sequences ") if ($seqid <= 1 || $seqid > $self->no_sequences);
 551     } else { # argument is a seq name
 552         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 553     }
 554
 555     for (my $i=0; $i<=$#seq; $i++) {
 556         my $pos=$i+1;
 557         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 558             unshift @new_seq, $seq[$i];
 559         } else {
 560             push @new_seq, $seq[$i];
 561         }
 562     }
 563     foreach (@new_seq) { $aln->add_seq($_);  }
 564     return $aln;
 565 }
 566
 567 sub _in_aln {  # check if input name exists in the alignment
 568     my ($str, $ref) = @_;
 569     foreach (@$ref) {
 570         return 1 if $str eq $_;
 571     }
 572     return 0;
 573 }
 574
 575
 576 =head2 uniq_seq
 577
 578  Title     : uniq_seq
 579  Usage     : $aln->uniq_seq():  Remove identical sequences in
 580              in the alignment.  Ambiguous base ("N", "n") and
 581              leading and ending gaps ("-") are NOT counted as
 582              differences.
 583  Function  : Make a new alignment of unique sequence types (STs)
 584  Returns   : 1. a new Bio::SimpleAlign object (all sequences renamed as "ST")
 585              2. ST of each sequence in STDERR
 586  Argument  : None
 587
 588 =cut
 589
 590 sub uniq_seq {
 591     my ($self, $seqid) = @_;
 592     my $aln = $self->new;
 593     my (%member, %order, @seq, @uniq_str);
 594     my $order=0;
 595     my $len = $self->length();
 596     foreach my $seq ( $self->each_seq() ) {
 597         my $str = $seq->seq();
 598
 599 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 600 # comparing two sequence strings
 601
 602     # 1st, convert "n", "N" to "?" (for DNA sequence only):
 603         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 604     # 2nd, convert leading and ending gaps to "?":
 605         $str = &_convert_leading_ending_gaps($str, '-', '?');
 606     # Note that '?' also can mean unknown residue.
 607     # I don't like making global class member changes like this, too
 608     # prone to errors... -- cjfields 08-11-18
 609     local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
 610         my $new = Bio::LocatableSeq->new(
 611                      -id      => $seq->id(),
 612                                          -alphabet=> $seq->alphabet,
 613                                          -seq     => $str,
 614                                          -start   => $seq->start,
 615                                          -end     => $seq->end
 616                                          );
 617         push @seq, $new;
 618     }
 619
 620     foreach my $seq (@seq) {
 621         my $str = $seq->seq();
 622         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 623         if ($seen) { # seen before
 624             my @memb = @{$member{$key}};
 625             push @memb, $seq;
 626             $member{$key} = \@memb;
 627         } else {  # not seen
 628             push @uniq_str, $key;
 629             $order++;
 630             $member{$key} = [ ($seq) ];
 631             $order{$key} = $order;
 632         }
 633     }
 634
 635     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 636 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 637         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 638 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 639         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 640         my $gap='-';
 641         my $end=length($str2);
 642         $end -= length($1) while $str2 =~ m/($gap+)/g;
 643         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 644                                          -seq  =>$str2,
 645                                          -start=>1,
 646                                          -end  =>$end
 647                                          );
 648         $aln->add_seq($new);
 649 #       print STDERR "ST".$order{$str}, "\t=>";
 650         foreach (@{$member{$str}}) {
 651         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 652     }
 653 #       print STDERR "\n";
 654     }
 655     return $aln;
 656 }
 657
 658 sub _check_uniq {  # check if same seq exists in the alignment
 659     my ($str1, $ref, $length) = @_;
 660     my @char1=split //, $str1;
 661     my @array=@$ref;
 662
 663     return (0, $str1) if @array==0; # not seen (1st sequence)
 664
 665     foreach my $str2 (@array) {
 666         my $diff=0;
 667         my @char2=split //, $str2;
 668         for (my $i=0; $i<=$length-1; $i++) {
 669             next if $char1[$i] eq '?';
 670             next if $char2[$i] eq '?';
 671             $diff++ if $char1[$i] ne $char2[$i];
 672         }
 673         return (1, $str2) if $diff == 0;  # seen before
 674     }
 675
 676     return (0, $str1); # not seen
 677 }
 678
 679 sub _convert_leading_ending_gaps {
 680     my $s=shift;
 681     my $sym1=shift;
 682     my $sym2=shift;
 683     my @array=split //, $s;
 684 # convert leading char:
 685     for (my $i=0; $i<=$#array; $i++) {
 686         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 687     }
 688 # convert ending char:
 689     for (my $i = $#array; $i>= 0; $i--) {
 690         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 691     }
 692     my $s_new=join '', @array;
 693     return $s_new;
 694 }
 695
 696 =head1 Sequence selection methods
 697
 698 Methods returning one or more sequences objects.
 699
 700 =head2 each_seq
 701
 702  Title     : each_seq
 703  Usage     : foreach $seq ( $align->each_seq() )
 704  Function  : Gets a Seq object from the alignment
 705  Returns   : Seq object
 706  Argument  :
 707
 708 =cut
 709
 710 sub eachSeq {
 711     my $self = shift;
 712     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 713     $self->each_seq();
 714 }
 715
 716 sub each_seq {
 717         my $self = shift;
 718         my (@arr,$order);
 719
 720         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 721                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 722                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 723                 }
 724         }
 725         return @arr;
 726 }
 727
 728
 729 =head2 each_alphabetically
 730
 731  Title     : each_alphabetically
 732  Usage     : foreach $seq ( $ali->each_alphabetically() )
 733  Function  : Returns a sequence object, but the objects are returned
 734              in alphabetically sorted order.
 735              Does not change the order of the alignment.
 736  Returns   : Seq object
 737  Argument  :
 738
 739 =cut
 740
 741 sub each_alphabetically {
 742         my $self = shift;
 743         my ($seq,$nse,@arr,%hash,$count);
 744
 745         foreach $seq ( $self->each_seq() ) {
 746                 $nse = $seq->get_nse;
 747                 $hash{$nse} = $seq;
 748         }
 749
 750         foreach $nse ( sort _alpha_startend keys %hash) {
 751                 push(@arr,$hash{$nse});
 752         }
 753         return @arr;
 754 }
 755
 756 sub _alpha_startend {
 757     my ($aname,$astart,$bname,$bstart);
 758     ($aname,$astart) = split (/-/,$a);
 759     ($bname,$bstart) = split (/-/,$b);
 760
 761     if( $aname eq $bname ) {
 762         return $astart <=> $bstart;
 763     }
 764     else {
 765         return $aname cmp $bname;
 766     }
 767 }
 768
 769 =head2 each_seq_with_id
 770
 771  Title     : each_seq_with_id
 772  Usage     : foreach $seq ( $align->each_seq_with_id() )
 773  Function  : Gets a Seq objects from the alignment, the contents
 774              being those sequences with the given name (there may be
 775              more than one)
 776  Returns   : Seq object
 777  Argument  : a seq name
 778
 779 =cut
 780
 781 sub eachSeqWithId {
 782     my $self = shift;
 783     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 784     $self->each_seq_with_id(@_);
 785 }
 786
 787 sub each_seq_with_id {
 788     my $self = shift;
 789     my $id = shift;
 790
 791     $self->throw("Method each_seq_with_id needs a sequence name argument")
 792         unless defined $id;
 793
 794     my (@arr, $seq);
 795
 796     if (exists($self->{'_start_end_lists'}->{$id})) {
 797         @arr = @{$self->{'_start_end_lists'}->{$id}};
 798     }
 799     return @arr;
 800 }
 801
 802 =head2 get_seq_by_pos
 803
 804  Title     : get_seq_by_pos
 805  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 806  Function  : Gets a sequence based on its position in the alignment.
 807              Numbering starts from 1.  Sequence positions larger than
 808              no_sequences() will thow an error.
 809  Returns   : a Bio::LocatableSeq object
 810  Args      : positive integer for the sequence position
 811
 812 =cut
 813
 814 sub get_seq_by_pos {
 815
 816     my $self = shift;
 817     my ($pos) = @_;
 818
 819     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 820         unless $pos =~ /^\d+$/ and $pos > 0;
 821     $self->throw("No sequence at position [$pos]")
 822         unless $pos <= $self->no_sequences ;
 823
 824     my $nse = $self->{'_order'}->{--$pos};
 825     return $self->{'_seq'}->{$nse};
 826 }
 827
 828 =head2 get_seq_by_id
 829
 830  Title     : get_seq_by_id
 831  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 832  Function  : Gets a sequence based on its name.
 833              Sequences that do not exist will warn and return undef
 834  Returns   : a Bio::LocatableSeq object
 835  Args      : string for sequence name
 836
 837 =cut
 838
 839 sub get_seq_by_id {
 840     my ($self,$name) = @_;
 841     unless( defined $name ) {
 842       $self->warn("Must provide a sequence name");
 843       return;
 844     }
 845     for my $seq ( values %{$self->{'_seq'}} ) {
 846       if ( $seq->id eq $name) {
 847         return $seq;
 848       }
 849     }
 850     return;
 851 }
 852
 853 =head2 seq_with_features
 854
 855  Title   : seq_with_features
 856  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 857                                           -consensus => 60
 858                                           -mask =>
 859            sub { my $consensus = shift;
 860
 861                  for my $i (1..5){
 862                     my $n = 'N' x $i;
 863                     my $q = '\?' x $i;
 864                     while($consensus =~ /[^?]$q[^?]/){
 865                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 866                     }
 867                   }
 868                  return $consensus;
 869                }
 870                                          );
 871  Function: produces a Bio::Seq object by first splicing gaps from -pos
 872            (by means of a splice_by_seq_pos() call), then creating
 873            features using non-? chars (by means of a consensus_string()
 874            call with stringency -consensus).
 875  Returns : a Bio::Seq object
 876  Args    : -pos : required. sequence from which to build the Bio::Seq
 877              object
 878            -consensus : optional, defaults to consensus_string()'s
 879              default cutoff value
 880            -mask : optional, a coderef to apply to consensus_string()'s
 881              output before building features.  this may be useful for
 882              closing gaps of 1 bp by masking over them with N, for
 883              instance
 884
 885 =cut
 886
 887 sub seq_with_features{
 888    my ($self,%arg) = @_;
 889
 890    #first do the preparatory splice
 891    $self->throw("must provide a -pos argument") unless $arg{-pos};
 892    $self->splice_by_seq_pos($arg{-pos});
 893
 894    my $consensus_string = $self->consensus_string($arg{-consensus});
 895    $consensus_string = $arg{-mask}->($consensus_string)
 896          if defined($arg{-mask});
 897
 898    my(@bs,@es);
 899
 900    push @bs, 1 if $consensus_string =~ /^[^?]/;
 901
 902    while($consensus_string =~ /\?[^?]/g){
 903          push @bs, pos($consensus_string);
 904    }
 905    while($consensus_string =~ /[^?]\?/g){
 906          push @es, pos($consensus_string);
 907    }
 908
 909    push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
 910
 911    my $seq = Bio::Seq->new();
 912
 913 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 914 #                -source_tag => 'location',
 915 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 916 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 917 #                                                  );
 918 #   $seq->add_SeqFeature($rootfeature);
 919
 920    while(my $b = shift @bs){
 921          my $e = shift @es;
 922          $seq->add_SeqFeature(
 923        Bio::SeqFeature::Generic->new(
 924          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 925          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 926          -source_tag => $self->source || 'MSA',
 927        )
 928      );
 929    }
 930
 931    return $seq;
 932 }
 933
 934
 935 =head1 Create new alignments
 936
 937 The result of these methods are horizontal or vertical subsets of the
 938 current MSA.
 939
 940 =head2 select
 941
 942  Title     : select
 943  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 944  Function  : Creates a new alignment from a continuous subset of
 945              sequences.  Numbering starts from 1.  Sequence positions
 946              larger than no_sequences() will thow an error.
 947  Returns   : a Bio::SimpleAlign object
 948  Args      : positive integer for the first sequence
 949              positive integer for the last sequence to include (optional)
 950
 951 =cut
 952
 953 sub select {
 954     my $self = shift;
 955     my ($start, $end) = @_;
 956
 957     $self->throw("Select start has to be a positive integer, not [$start]")
 958         unless $start =~ /^\d+$/ and $start > 0;
 959     $self->throw("Select end has to be a positive integer, not [$end]")
 960         unless $end  =~ /^\d+$/ and $end > 0;
 961     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
 962         unless $start <= $end;
 963
 964     my $aln = $self->new;
 965     foreach my $pos ($start .. $end) {
 966         $aln->add_seq($self->get_seq_by_pos($pos));
 967     }
 968     $aln->id($self->id);
 969     # fix for meta, sf, ann
 970     return $aln;
 971 }
 972
 973 =head2 select_noncont
 974
 975  Title     : select_noncont
 976  Usage     : # 1st and 3rd sequences, sorted
 977              $aln2 = $aln->select_noncont(1, 3)
 978
 979              # 1st and 3rd sequences, sorted (same as first)
 980              $aln2 = $aln->select_noncont(3, 1)
 981
 982              # 1st and 3rd sequences, unsorted
 983              $aln2 = $aln->select_noncont('nosort',3, 1)
 984
 985  Function  : Creates a new alignment from a subset of sequences.  Numbering
 986              starts from 1.  Sequence positions larger than no_sequences() will
 987              throw an error.  Sorts the order added to new alignment by default,
 988              to prevent sorting pass 'nosort' as the first argument in the list.
 989  Returns   : a Bio::SimpleAlign object
 990  Args      : array of integers for the sequences.  If the string 'nosort' is
 991              passed as the first argument, the sequences will not be sorted
 992              in the new alignment but will appear in the order listed.
 993
 994 =cut
 995
 996 sub select_noncont {
 997         my $self = shift;
 998     my $nosort = 0;
 999         my (@pos) = @_;
1000     if ($pos[0] !~ m{^\d+$}) {
1001         my $sortcmd = shift @pos;
1002         if ($sortcmd eq 'nosort') {
1003             $nosort = 1;
1004         } else {
1005             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
1006         }
1007     }
1008
1009     my $end = $self->no_sequences;
1010     foreach ( @pos ) {
1011                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1012                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1013         }
1014
1015         @pos = sort {$a <=> $b} @pos unless $nosort;
1016
1017         my $aln = $self->new;
1018         foreach my $p (@pos) {
1019                 $aln->add_seq($self->get_seq_by_pos($p));
1020         }
1021         $aln->id($self->id);
1022     # fix for meta, sf, ann
1023         return $aln;
1024 }
1025
1026 =head2 slice
1027
1028  Title     : slice
1029  Usage     : $aln2 = $aln->slice(20,30)
1030  Function  : Creates a slice from the alignment inclusive of start and
1031              end columns, and the first column in the alignment is denoted 1.
1032              Sequences with no residues in the slice are excluded from the
1033              new alignment and a warning is printed. Slice beyond the length of
1034              the sequence does not do padding.
1035  Returns   : A Bio::SimpleAlign object
1036  Args      : Positive integer for start column, positive integer for end column,
1037              optional boolean which if true will keep gap-only columns in the newly
1038              created slice. Example:
1039
1040              $aln2 = $aln->slice(20,30,1)
1041
1042 =cut
1043
1044 sub slice {
1045         my $self = shift;
1046         my ($start, $end, $keep_gap_only) = @_;
1047
1048         $self->throw("Slice start has to be a positive integer, not [$start]")
1049           unless $start =~ /^\d+$/ and $start > 0;
1050         $self->throw("Slice end has to be a positive integer, not [$end]")
1051           unless $end =~ /^\d+$/ and $end > 0;
1052         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1053           unless $start <= $end;
1054         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1055                                          "[$start] is too big.") if $start > $self->length;
1056     my $cons_meta = $self->consensus_meta;
1057         my $aln = $self->new;
1058         $aln->id($self->id);
1059         foreach my $seq ( $self->each_seq() ) {
1060             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1061             Bio::Seq::Meta->new
1062         (-id      => $seq->id,
1063                  -alphabet => $seq->alphabet,
1064                  -strand  => $seq->strand,
1065                  -verbose => $self->verbose) :
1066             Bio::LocatableSeq->new
1067         (-id      => $seq->id,
1068                  -alphabet => $seq->alphabet,
1069                  -strand  => $seq->strand,
1070                  -verbose => $self->verbose);
1071
1072             # seq
1073             my $seq_end = $end;
1074             $seq_end = $seq->length if( $end > $seq->length );
1075
1076             my $slice_seq = $seq->subseq($start, $seq_end);
1077             $new_seq->seq( $slice_seq );
1078
1079             $slice_seq =~ s/\W//g;
1080
1081             if ($start > 1) {
1082             my $pre_start_seq = $seq->subseq(1, $start - 1);
1083             $pre_start_seq =~ s/\W//g;
1084             if (!defined($seq->strand)) {
1085                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1086             } elsif ($seq->strand < 0){
1087                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1088             } else {
1089                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1090             }
1091             } else {
1092             $new_seq->start( $seq->start);
1093             }
1094         if ($new_seq->isa('Bio::Seq::MetaI')) {
1095             for my $meta_name ($seq->meta_names) {
1096                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1097             }
1098         }
1099             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1100
1101             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1102             $aln->add_seq($new_seq);
1103             } else {
1104             if( $keep_gap_only ) {
1105                 $aln->add_seq($new_seq);
1106             } else {
1107                 my $nse = $seq->get_nse();
1108                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1109                     " Sequence excluded from the new alignment.");
1110             }
1111             }
1112         }
1113     if ($cons_meta) {
1114         my $new = Bio::Seq::Meta->new();
1115         for my $meta_name ($cons_meta->meta_names) {
1116             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1117         }
1118         $aln->consensus_meta($new);
1119     }
1120     $aln->annotation($self->annotation);
1121     # fix for meta, sf, ann
1122         return $aln;
1123 }
1124
1125 =head2 remove_columns
1126
1127  Title     : remove_columns
1128  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1129              $aln2 = $aln->remove_columns([0,0],[6,8])
1130  Function  : Creates an aligment with columns removed corresponding to
1131              the specified type or by specifying the columns by number.
1132  Returns   : Bio::SimpleAlign object
1133  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1134              'all_gaps_columns') or array ref where the referenced array
1135              contains a pair of integers that specify a range.
1136              The first column is 0,
1137
1138 =cut
1139
1140 sub remove_columns {
1141         my ($self,@args) = @_;
1142         @args || return $self;
1143    my $aln;
1144
1145         if ($args[0][0] =~ /^[a-z_]+$/i) {
1146                  $aln = $self->_remove_columns_by_type($args[0]);
1147         } elsif ($args[0][0] =~ /^\d+$/) {
1148        $aln = $self->_remove_columns_by_num(\@args);
1149         } else {
1150                  $self->throw("You must pass array references to remove_columns(), not @args");
1151         }
1152     # fix for meta, sf, ann
1153    $aln;
1154 }
1155
1156
1157 =head2 remove_gaps
1158
1159  Title     : remove_gaps
1160  Usage     : $aln2 = $aln->remove_gaps
1161  Function  : Creates an aligment with gaps removed
1162  Returns   : a Bio::SimpleAlign object
1163  Args      : a gap character(optional) if none specified taken
1164                 from $self->gap_char,
1165              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1166                         indicates that only all-gaps columns should be deleted
1167
1168 Used from method L<remove_columns> in most cases. Set gap character
1169 using L<gap_char()|gap_char>.
1170
1171 =cut
1172
1173 sub remove_gaps {
1174     my ($self,$gapchar,$all_gaps_columns) = @_;
1175     my $gap_line;
1176     if ($all_gaps_columns) {
1177         $gap_line = $self->all_gap_line($gapchar);
1178     } else {
1179         $gap_line = $self->gap_line($gapchar);
1180     }
1181     my $aln = $self->new;
1182
1183     my @remove;
1184     my $length = 0;
1185     my $del_char = $gapchar || $self->gap_char;
1186     # Do the matching to get the segments to remove
1187     while ($gap_line =~ m/[$del_char]/g) {
1188         my $start = pos($gap_line)-1;
1189         $gap_line=~/\G[$del_char]+/gc;
1190         my $end = pos($gap_line)-1;
1191
1192         #have to offset the start and end for subsequent removes
1193         $start-=$length;
1194         $end  -=$length;
1195         $length += ($end-$start+1);
1196         push @remove, [$start,$end];
1197     }
1198
1199     #remove the segments
1200     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1201     # fix for meta, sf, ann
1202     return $aln;
1203 }
1204
1205
1206 sub _remove_col {
1207     my ($self,$aln,$remove) = @_;
1208     my @new;
1209
1210     my $gap = $self->gap_char;
1211
1212     # splice out the segments and create new seq
1213     foreach my $seq($self->each_seq){
1214         my $new_seq = Bio::LocatableSeq->new(
1215                                              -id      => $seq->id,
1216                                              -alphabet=> $seq->alphabet,
1217                                              -strand  => $seq->strand,
1218                                              -verbose => $self->verbose);
1219         my $sequence = $seq->seq;
1220         foreach my $pair(@{$remove}){
1221             my $start = $pair->[0];
1222             my $end   = $pair->[1];
1223             $sequence = $seq->seq unless $sequence;
1224             my $orig = $sequence;
1225             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1226             my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1227             $sequence = $head.$tail;
1228             # start
1229             unless (defined $new_seq->start) {
1230                 if ($start == 0) {
1231                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1232                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1233                 }
1234                 else {
1235                     my $start_adjust = $orig =~ /^$gap+/;
1236                     if ($start_adjust) {
1237                         $start_adjust = $+[0] == $start;
1238                     }
1239                     $new_seq->start($seq->start + $start_adjust);
1240                 }
1241             }
1242             # end
1243             if (($end + 1) >= length($orig)) {
1244                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1245                 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1246             }
1247             else {
1248                 $new_seq->end($seq->end);
1249             }
1250         }
1251
1252         if ($new_seq->end < $new_seq->start) {
1253             # we removed all columns except for gaps: set to 0 to indicate no
1254             # sequence
1255             $new_seq->start(0);
1256             $new_seq->end(0);
1257         }
1258
1259         $new_seq->seq($sequence) if $sequence;
1260                 push @new, $new_seq;
1261     }
1262     # add the new seqs to the alignment
1263     foreach my $new(@new){
1264         $aln->add_seq($new);
1265     }
1266     # fix for meta, sf, ann
1267     return $aln;
1268 }
1269
1270 sub _remove_columns_by_type {
1271         my ($self,$type) = @_;
1272         my $aln = $self->new;
1273         my @remove;
1274
1275         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1276         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1277         my %matchchars = ( 'match'           => '\*',
1278                        'weak'             => '\.',
1279                        'strong'           => ':',
1280                        'mismatch'         => ' ',
1281                        'gaps'             => '',
1282                        'all_gaps_columns' => ''
1283                      );
1284         # get the characters to delete against
1285         my $del_char;
1286         foreach my $type (@{$type}){
1287                 $del_char.= $matchchars{$type};
1288         }
1289
1290         my $length = 0;
1291         my $match_line = $self->match_line;
1292         # do the matching to get the segments to remove
1293         if($del_char){
1294                 while($match_line =~ m/[$del_char]/g ){
1295                         my $start = pos($match_line)-1;
1296                         $match_line=~/\G[$del_char]+/gc;
1297                         my $end = pos($match_line)-1;
1298
1299                         #have to offset the start and end for subsequent removes
1300                         $start-=$length;
1301                         $end  -=$length;
1302                         $length += ($end-$start+1);
1303                         push @remove, [$start,$end];
1304                 }
1305         }
1306
1307         # remove the segments
1308         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1309         $aln = $aln->remove_gaps() if $gap;
1310         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1311     # fix for meta, sf, ann
1312         $aln;
1313 }
1314
1315
1316 sub _remove_columns_by_num {
1317         my ($self,$positions) = @_;
1318         my $aln = $self->new;
1319
1320         # sort the positions
1321         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1322
1323     my @remove;
1324     my $length = 0;
1325     foreach my $pos (@{$positions}) {
1326         my ($start, $end) = @{$pos};
1327
1328         #have to offset the start and end for subsequent removes
1329         $start-=$length;
1330         $end  -=$length;
1331         $length += ($end-$start+1);
1332         push @remove, [$start,$end];
1333     }
1334
1335     #remove the segments
1336     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1337     # fix for meta, sf, ann
1338         $aln;
1339 }
1340
1341
1342 =head1 Change sequences within the MSA
1343
1344 These methods affect characters in all sequences without changing the
1345 alignment.
1346
1347 =head2 splice_by_seq_pos
1348
1349  Title   : splice_by_seq_pos
1350  Usage   : $status = splice_by_seq_pos(1);
1351  Function: splices all aligned sequences where the specified sequence
1352            has gaps.
1353  Example :
1354  Returns : 1 on success
1355  Args    : position of sequence to splice by
1356
1357
1358 =cut
1359
1360 sub splice_by_seq_pos{
1361   my ($self,$pos) = @_;
1362
1363   my $guide = $self->get_seq_by_pos($pos);
1364   my $guide_seq = $guide->seq;
1365
1366   $guide_seq =~ s/\./\-/g;
1367
1368   my @gaps = ();
1369   $pos = -1;
1370   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1371     unshift @gaps, $pos;
1372     $pos++;
1373   }
1374
1375   foreach my $seq ($self->each_seq){
1376     my @bases = split '', $seq->seq;
1377
1378     splice(@bases, $_, 1) foreach @gaps;
1379     $seq->seq(join('', @bases));
1380   }
1381
1382   1;
1383 }
1384
1385 =head2 map_chars
1386
1387  Title     : map_chars
1388  Usage     : $ali->map_chars('\.','-')
1389  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1390              characters
1391
1392              Notice that the from (arg1) is interpretted as a regex,
1393              so be careful about quoting meta characters (eg
1394              $ali->map_chars('.','-') wont do what you want)
1395  Returns   :
1396  Argument  : 'from' rexexp
1397              'to' string
1398
1399 =cut
1400
1401 sub map_chars {
1402     my $self = shift;
1403     my $from = shift;
1404     my $to   = shift;
1405     my ($seq,$temp);
1406
1407     $self->throw("Need exactly two arguments")
1408         unless defined $from and defined $to;
1409
1410     foreach $seq ( $self->each_seq() ) {
1411         $temp = $seq->seq();
1412         $temp =~ s/$from/$to/g;
1413         $seq->seq($temp);
1414     }
1415     return 1;
1416 }
1417
1418
1419 =head2 uppercase
1420
1421  Title     : uppercase()
1422  Usage     : $ali->uppercase()
1423  Function  : Sets all the sequences to uppercase
1424  Returns   :
1425  Argument  :
1426
1427 =cut
1428
1429 sub uppercase {
1430     my $self = shift;
1431     my $seq;
1432     my $temp;
1433
1434     foreach $seq ( $self->each_seq() ) {
1435       $temp = $seq->seq();
1436       $temp =~ tr/[a-z]/[A-Z]/;
1437
1438       $seq->seq($temp);
1439     }
1440     return 1;
1441 }
1442
1443 =head2 cigar_line
1444
1445  Title    : cigar_line()
1446  Usage    : %cigars = $align->cigar_line()
1447  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1448             Report) line for each sequence in the alignment. Examples are
1449             "1,60" or "5,10:12,58", where the numbers refer to conserved
1450             positions within the alignment. The keys of the hash are the
1451             NSEs (name/start/end) assigned to each sequence.
1452  Args     : threshold (optional, defaults to 100)
1453  Returns  : Hash of strings (cigar lines)
1454
1455 =cut
1456
1457 sub cigar_line {
1458         my $self = shift;
1459         my $thr=shift||100;
1460         my %cigars;
1461
1462         my @consensus = split "",($self->consensus_string($thr));
1463         my $len = $self->length;
1464         my $gapchar = $self->gap_char;
1465
1466         # create a precursor, something like (1,4,5,6,7,33,45),
1467         # where each number corresponds to a conserved position
1468         foreach my $seq ( $self->each_seq ) {
1469                 my @seq = split "", uc ($seq->seq);
1470                 my $pos = 1;
1471                 for (my $x = 0 ; $x < $len ; $x++ ) {
1472                         if ($seq[$x] eq $consensus[$x]) {
1473                                 push @{$cigars{$seq->get_nse}},$pos;
1474                                 $pos++;
1475                         } elsif ($seq[$x] ne $gapchar) {
1476                                 $pos++;
1477                         }
1478                 }
1479         }
1480         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1481         for my $name (keys %cigars) {
1482                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1483                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1484       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1485            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1486                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1487                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1488                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1489                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1490                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1491                         }
1492       }
1493         }
1494   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1495   for my $name (keys %cigars) {
1496           my @remove;
1497           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1498                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1499                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1500                       unshift @remove,$x;
1501               }
1502            }
1503       for my $pos (@remove) {
1504                         splice @{$cigars{$name}}, $pos, 1;
1505            }
1506    }
1507    # join and punctuate
1508    for my $name (keys %cigars) {
1509           my ($start,$end,$str) = "";
1510           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1511                   $str .= ($start . "," . $end . ":");
1512           }
1513           $str =~ s/:$//;
1514       $cigars{$name} = $str;
1515    }
1516    %cigars;
1517 }
1518
1519
1520 =head2 match_line
1521
1522  Title    : match_line()
1523  Usage    : $line = $align->match_line()
1524  Function : Generates a match line - much like consensus string
1525             except that a line indicating the '*' for a match.
1526  Args     : (optional) Match line characters ('*' by default)
1527             (optional) Strong match char (':' by default)
1528             (optional) Weak match char ('.' by default)
1529  Returns  : String
1530
1531 =cut
1532
1533 sub match_line {
1534         my ($self,$matchlinechar, $strong, $weak) = @_;
1535         my %matchchars = ('match'    => $matchlinechar || '*',
1536                                                           'weak'     => $weak          || '.',
1537                                                           'strong'   => $strong        || ':',
1538                                                           'mismatch' => ' ',
1539                                                   );
1540
1541         my @seqchars;
1542         my $alphabet;
1543         foreach my $seq ( $self->each_seq ) {
1544                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1545                 $alphabet = $seq->alphabet unless defined $alphabet;
1546         }
1547         my $refseq = shift @seqchars;
1548         # let's just march down the columns
1549         my $matchline;
1550  POS:
1551         foreach my $pos ( 0..$self->length ) {
1552                 my $refchar = $refseq->[$pos];
1553                 my $char = $matchchars{'mismatch'};
1554                 unless( defined $refchar ) {
1555                         last if $pos == $self->length; # short circuit on last residue
1556                         # this in place to handle jason's soon-to-be-committed
1557                         # intron mapping code
1558                         goto bottom;
1559                 }
1560                 my %col = ($refchar => 1);
1561                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1562                 foreach my $seq ( @seqchars ) {
1563                         next if $pos >= scalar @$seq;
1564                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1565                                                           $seq->[$pos] eq ' ' );
1566                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1567                 }
1568                 my @colresidues = sort keys %col;
1569
1570                 # if all the values are the same
1571                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1572                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1573                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1574                         # matches for protein seqs
1575             TYPE:
1576                         foreach my $type ( qw(strong weak) ) {
1577                                 # iterate through categories
1578                                 my %groups;
1579                                 # iterate through each of the aa in the col
1580                                 # look to see which groups it is in
1581                                 foreach my $c ( @colresidues ) {
1582                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1583                                                 push @{$groups{$f}},$c;
1584                                         }
1585                                 }
1586                          GRP:
1587                                 foreach my $cols ( values %groups ) {
1588                                         @$cols = sort @$cols;
1589                                         # now we are just testing to see if two arrays
1590                                         # are identical w/o changing either one
1591                                         # have to be same len
1592                                         next if( scalar @$cols != scalar @colresidues );
1593                                         # walk down the length and check each slot
1594                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1595                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1596                                         }
1597                                         $char = $matchchars{$type};
1598                                         last TYPE;
1599                                 }
1600                         }
1601                 }
1602          bottom:
1603                 $matchline .= $char;
1604         }
1605         return $matchline;
1606 }
1607
1608
1609 =head2 gap_line
1610
1611  Title    : gap_line()
1612  Usage    : $line = $align->gap_line()
1613  Function : Generates a gap line - much like consensus string
1614             except that a line where '-' represents gap
1615  Args     : (optional) gap line characters ('-' by default)
1616  Returns  : string
1617
1618 =cut
1619
1620 sub gap_line {
1621     my ($self,$gapchar) = @_;
1622     $gapchar = $gapchar || $self->gap_char;
1623     my %gap_hsh; # column gaps vector
1624     foreach my $seq ( $self->each_seq ) {
1625                 my $i = 0;
1626         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1627                   map {[$i++, $_]} split(//, uc ($seq->seq));
1628     }
1629     my $gap_line;
1630     foreach my $pos ( 0..$self->length-1 ) {
1631           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1632     }
1633     return $gap_line;
1634 }
1635
1636 =head2 all_gap_line
1637
1638  Title    : all_gap_line()
1639  Usage    : $line = $align->all_gap_line()
1640  Function : Generates a gap line - much like consensus string
1641             except that a line where '-' represents all-gap column
1642  Args     : (optional) gap line characters ('-' by default)
1643  Returns  : string
1644
1645 =cut
1646
1647 sub all_gap_line {
1648     my ($self,$gapchar) = @_;
1649     $gapchar = $gapchar || $self->gap_char;
1650     my %gap_hsh;                # column gaps counter hash
1651     my @seqs = $self->each_seq;
1652     foreach my $seq ( @seqs ) {
1653         my $i = 0;
1654         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1655         map {[$i++, $_]} split(//, uc ($seq->seq));
1656     }
1657     my $gap_line;
1658     foreach my $pos ( 0..$self->length-1 ) {
1659         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1660             # gaps column
1661             $gap_line .= $gapchar;
1662         } else {
1663             $gap_line .= '.';
1664         }
1665     }
1666     return $gap_line;
1667 }
1668
1669 =head2 gap_col_matrix
1670
1671  Title    : gap_col_matrix()
1672  Usage    : my $cols = $align->gap_col_matrix()
1673  Function : Generates an array of hashes where
1674             each entry in the array is a hash reference
1675             with keys of all the sequence names and
1676             and value of 1 or 0 if the sequence has a gap at that column
1677  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1678
1679 =cut
1680
1681 sub gap_col_matrix {
1682     my ($self,$gapchar) = @_;
1683     $gapchar = $gapchar || $self->gap_char;
1684     my %gap_hsh; # column gaps vector
1685     my @cols;
1686     foreach my $seq ( $self->each_seq ) {
1687         my $i = 0;
1688         my $str = $seq->seq;
1689         my $len = $seq->length;
1690         my $ch;
1691         my $id = $seq->display_id;
1692         while( $i < $len ) {
1693             $ch = substr($str, $i, 1);
1694             $cols[$i++]->{$id} = ($ch eq $gapchar);
1695         }
1696     }
1697     return \@cols;
1698 }
1699
1700 =head2 match
1701
1702  Title     : match()
1703  Usage     : $ali->match()
1704  Function  : Goes through all columns and changes residues that are
1705              identical to residue in first sequence to match '.'
1706              character. Sets match_char.
1707
1708              USE WITH CARE: Most MSA formats do not support match
1709              characters in sequences, so this is mostly for output
1710              only. NEXUS format (Bio::AlignIO::nexus) can handle
1711              it.
1712  Returns   : 1
1713  Argument  : a match character, optional, defaults to '.'
1714
1715 =cut
1716
1717 sub match {
1718     my ($self, $match) = @_;
1719
1720     $match ||= '.';
1721     my ($matching_char) = $match;
1722     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1723     $self->map_chars($matching_char, '-');
1724
1725     my @seqs = $self->each_seq();
1726     return 1 unless scalar @seqs > 1;
1727
1728     my $refseq = shift @seqs ;
1729     my @refseq = split //, $refseq->seq;
1730     my $gapchar = $self->gap_char;
1731
1732     foreach my $seq ( @seqs ) {
1733         my @varseq = split //, $seq->seq();
1734         for ( my $i=0; $i < scalar @varseq; $i++) {
1735             $varseq[$i] = $match if defined $refseq[$i] &&
1736                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1737                   $refseq[$i] =~ /$gapchar/ )
1738                       && $refseq[$i] eq $varseq[$i];
1739         }
1740         $seq->seq(join '', @varseq);
1741     }
1742     $self->match_char($match);
1743     return 1;
1744 }
1745
1746
1747 =head2 unmatch
1748
1749  Title     : unmatch()
1750  Usage     : $ali->unmatch()
1751  Function  : Undoes the effect of method match. Unsets match_char.
1752  Returns   : 1
1753  Argument  : a match character, optional, defaults to '.'
1754
1755 See L<match> and L<match_char>
1756
1757 =cut
1758
1759 sub unmatch {
1760     my ($self, $match) = @_;
1761
1762     $match ||= '.';
1763
1764     my @seqs = $self->each_seq();
1765     return 1 unless scalar @seqs > 1;
1766
1767     my $refseq = shift @seqs ;
1768     my @refseq = split //, $refseq->seq;
1769     my $gapchar = $self->gap_char;
1770     foreach my $seq ( @seqs ) {
1771         my @varseq = split //, $seq->seq();
1772         for ( my $i=0; $i < scalar @varseq; $i++) {
1773             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1774                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1775                   $refseq[$i] =~ /$gapchar/ ) &&
1776                       $varseq[$i] eq $match;
1777         }
1778         $seq->seq(join '', @varseq);
1779     }
1780     $self->match_char('');
1781     return 1;
1782 }
1783
1784 =head1 MSA attributes
1785
1786 Methods for setting and reading the MSA attributes.
1787
1788 Note that the methods defining character semantics depend on the user
1789 to set them sensibly.  They are needed only by certain input/output
1790 methods. Unset them by setting to an empty string ('').
1791
1792 =head2 id
1793
1794  Title     : id
1795  Usage     : $myalign->id("Ig")
1796  Function  : Gets/sets the id field of the alignment
1797  Returns   : An id string
1798  Argument  : An id string (optional)
1799
1800 =cut
1801
1802 sub id {
1803     my ($self, $name) = @_;
1804
1805     if (defined( $name )) {
1806         $self->{'_id'} = $name;
1807     }
1808
1809     return $self->{'_id'};
1810 }
1811
1812 =head2 accession
1813
1814  Title     : accession
1815  Usage     : $myalign->accession("PF00244")
1816  Function  : Gets/sets the accession field of the alignment
1817  Returns   : An acc string
1818  Argument  : An acc string (optional)
1819
1820 =cut
1821
1822 sub accession {
1823     my ($self, $acc) = @_;
1824
1825     if (defined( $acc )) {
1826         $self->{'_accession'} = $acc;
1827     }
1828
1829     return $self->{'_accession'};
1830 }
1831
1832 =head2 description
1833
1834  Title     : description
1835  Usage     : $myalign->description("14-3-3 proteins")
1836  Function  : Gets/sets the description field of the alignment
1837  Returns   : An description string
1838  Argument  : An description string (optional)
1839
1840 =cut
1841
1842 sub description {
1843     my ($self, $name) = @_;
1844
1845     if (defined( $name )) {
1846         $self->{'_description'} = $name;
1847     }
1848
1849     return $self->{'_description'};
1850 }
1851
1852 =head2 missing_char
1853
1854  Title     : missing_char
1855  Usage     : $myalign->missing_char("?")
1856  Function  : Gets/sets the missing_char attribute of the alignment
1857              It is generally recommended to set it to 'n' or 'N'
1858              for nucleotides and to 'X' for protein.
1859  Returns   : An missing_char string,
1860  Argument  : An missing_char string (optional)
1861
1862 =cut
1863
1864 sub missing_char {
1865     my ($self, $char) = @_;
1866
1867     if (defined $char ) {
1868         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1869         $self->{'_missing_char'} = $char;
1870     }
1871
1872     return $self->{'_missing_char'};
1873 }
1874
1875 =head2 match_char
1876
1877  Title     : match_char
1878  Usage     : $myalign->match_char('.')
1879  Function  : Gets/sets the match_char attribute of the alignment
1880  Returns   : An match_char string,
1881  Argument  : An match_char string (optional)
1882
1883 =cut
1884
1885 sub match_char {
1886     my ($self, $char) = @_;
1887
1888     if (defined $char ) {
1889         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1890         $self->{'_match_char'} = $char;
1891     }
1892
1893     return $self->{'_match_char'};
1894 }
1895
1896 =head2 gap_char
1897
1898  Title     : gap_char
1899  Usage     : $myalign->gap_char('-')
1900  Function  : Gets/sets the gap_char attribute of the alignment
1901  Returns   : An gap_char string, defaults to '-'
1902  Argument  : An gap_char string (optional)
1903
1904 =cut
1905
1906 sub gap_char {
1907     my ($self, $char) = @_;
1908
1909     if (defined $char || ! defined $self->{'_gap_char'} ) {
1910         $char= '-' unless defined $char;
1911         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1912         $self->{'_gap_char'} = $char;
1913     }
1914     return $self->{'_gap_char'};
1915 }
1916
1917 =head2 symbol_chars
1918
1919  Title   : symbol_chars
1920  Usage   : my @symbolchars = $aln->symbol_chars;
1921  Function: Returns all the seen symbols (other than gaps)
1922  Returns : array of characters that are the seen symbols
1923  Args    : boolean to include the gap/missing/match characters
1924
1925 =cut
1926
1927 sub symbol_chars{
1928    my ($self,$includeextra) = @_;
1929
1930    unless ($self->{'_symbols'}) {
1931        foreach my $seq ($self->each_seq) {
1932            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1933        }
1934    }
1935    my %copy = %{$self->{'_symbols'}};
1936    if( ! $includeextra ) {
1937        foreach my $char ( $self->gap_char, $self->match_char,
1938                           $self->missing_char) {
1939            delete $copy{$char} if( defined $char );
1940        }
1941    }
1942    return keys %copy;
1943 }
1944
1945 =head1 Alignment descriptors
1946
1947 These read only methods describe the MSA in various ways.
1948
1949
1950 =head2 score
1951
1952  Title     : score
1953  Usage     : $str = $ali->score()
1954  Function  : get/set a score of the alignment
1955  Returns   : a score for the alignment
1956  Argument  : an optional score to set
1957
1958 =cut
1959
1960 sub score {
1961   my $self = shift;
1962   $self->{score} = shift if @_;
1963   return $self->{score};
1964 }
1965
1966 =head2 consensus_string
1967
1968  Title     : consensus_string
1969  Usage     : $str = $ali->consensus_string($threshold_percent)
1970  Function  : Makes a strict consensus
1971  Returns   : Consensus string
1972  Argument  : Optional treshold ranging from 0 to 100.
1973              The consensus residue has to appear at least threshold %
1974              of the sequences at a given location, otherwise a '?'
1975              character will be placed at that location.
1976              (Default value = 0%)
1977
1978 =cut
1979
1980 sub consensus_string {
1981     my $self = shift;
1982     my $threshold = shift;
1983
1984     my $out = "";
1985     my $len = $self->length - 1;
1986
1987     foreach ( 0 .. $len ) {
1988         $out .= $self->_consensus_aa($_,$threshold);
1989     }
1990     return $out;
1991 }
1992
1993 sub _consensus_aa {
1994     my $self = shift;
1995     my $point = shift;
1996     my $threshold_percent = shift || -1 ;
1997     my ($seq,%hash,$count,$letter,$key);
1998     my $gapchar = $self->gap_char;
1999     foreach $seq ( $self->each_seq() ) {
2000         $letter = substr($seq->seq,$point,1);
2001         $self->throw("--$point-----------") if $letter eq '';
2002         ($letter eq $gapchar || $letter =~ /\./) && next;
2003         # print "Looking at $letter\n";
2004         $hash{$letter}++;
2005     }
2006     my $number_of_sequences = $self->no_sequences();
2007     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2008     $count = -1;
2009     $letter = '?';
2010
2011     foreach $key ( sort keys %hash ) {
2012         # print "Now at $key $hash{$key}\n";
2013         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2014             $letter = $key;
2015             $count = $hash{$key};
2016         }
2017     }
2018     return $letter;
2019 }
2020
2021
2022 =head2 consensus_iupac
2023
2024  Title     : consensus_iupac
2025  Usage     : $str = $ali->consensus_iupac()
2026  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
2027              and RNA. The output is in upper case except when gaps in
2028              a column force output to be in lower case.
2029
2030              Note that if your alignment sequences contain a lot of
2031              IUPAC ambiquity codes you often have to manually set
2032              alphabet.  Bio::PrimarySeq::_guess_type thinks they
2033              indicate a protein sequence.
2034  Returns   : consensus string
2035  Argument  : none
2036  Throws    : on protein sequences
2037
2038 =cut
2039
2040 sub consensus_iupac {
2041     my $self = shift;
2042     my $out = "";
2043     my $len = $self->length-1;
2044
2045     # only DNA and RNA sequences are valid
2046     foreach my $seq ( $self->each_seq() ) {
2047         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2048             if $seq->alphabet eq 'protein';
2049     }
2050     # loop over the alignment columns
2051     foreach my $count ( 0 .. $len ) {
2052         $out .= $self->_consensus_iupac($count);
2053     }
2054     return $out;
2055 }
2056
2057 sub _consensus_iupac {
2058     my ($self, $column) = @_;
2059     my ($string, $char, $rna);
2060
2061     #determine all residues in a column
2062     foreach my $seq ( $self->each_seq() ) {
2063         $string .= substr($seq->seq, $column, 1);
2064     }
2065     $string = uc $string;
2066
2067     # quick exit if there's an N in the string
2068     if ($string =~ /N/) {
2069         $string =~ /\W/ ? return 'n' : return 'N';
2070     }
2071     # ... or if there are only gap characters
2072     return '-' if $string =~ /^\W+$/;
2073
2074     # treat RNA as DNA in regexps
2075     if ($string =~ /U/) {
2076         $string =~ s/U/T/;
2077         $rna = 1;
2078     }
2079
2080     # the following s///'s only need to be done to the _first_ ambiguity code
2081     # as we only need to see the _range_ of characters in $string
2082
2083     if ($string =~ /[VDHB]/) {
2084         $string =~ s/V/AGC/;
2085         $string =~ s/D/AGT/;
2086         $string =~ s/H/ACT/;
2087         $string =~ s/B/CTG/;
2088     }
2089
2090     if ($string =~ /[SKYRWM]/) {
2091         $string =~ s/S/GC/;
2092         $string =~ s/K/GT/;
2093         $string =~ s/Y/CT/;
2094         $string =~ s/R/AG/;
2095         $string =~ s/W/AT/;
2096         $string =~ s/M/AC/;
2097     }
2098
2099     # and now the guts of the thing
2100
2101     if ($string =~ /A/) {
2102         $char = 'A';                     # A                      A
2103         if ($string =~ /G/) {
2104             $char = 'R';                 # A and G (purines)      R
2105             if ($string =~ /C/) {
2106                 $char = 'V';             # A and G and C          V
2107                 if ($string =~ /T/) {
2108                     $char = 'N';         # A and G and C and T    N
2109                 }
2110             } elsif ($string =~ /T/) {
2111                 $char = 'D';             # A and G and T          D
2112             }
2113         } elsif ($string =~ /C/) {
2114             $char = 'M';                 # A and C                M
2115             if ($string =~ /T/) {
2116                 $char = 'H';             # A and C and T          H
2117             }
2118         } elsif ($string =~ /T/) {
2119             $char = 'W';                 # A and T                W
2120         }
2121     } elsif ($string =~ /C/) {
2122         $char = 'C';                     # C                      C
2123         if ($string =~ /T/) {
2124             $char = 'Y';                 # C and T (pyrimidines)  Y
2125             if ($string =~ /G/) {
2126                 $char = 'B';             # C and T and G          B
2127             }
2128         } elsif ($string =~ /G/) {
2129             $char = 'S';                 # C and G                S
2130         }
2131     } elsif ($string =~ /G/) {
2132         $char = 'G';                     # G                      G
2133         if ($string =~ /C/) {
2134             $char = 'S';                 # G and C                S
2135         } elsif ($string =~ /T/) {
2136             $char = 'K';                 # G and T                K
2137         }
2138     } elsif ($string =~ /T/) {
2139         $char = 'T';                     # T                      T
2140     }
2141
2142     $char = 'U' if $rna and $char eq 'T';
2143     $char = lc $char if $string =~ /\W/;
2144
2145     return $char;
2146 }
2147
2148
2149 =head2 consensus_meta
2150
2151  Title     : consensus_meta
2152  Usage     : $seqmeta = $ali->consensus_meta()
2153  Function  : Returns a Bio::Seq::Meta object containing the consensus
2154              strings derived from meta data analysis.
2155  Returns   : Bio::Seq::Meta
2156  Argument  : Bio::Seq::Meta
2157  Throws    : non-MetaI object
2158
2159 =cut
2160
2161 sub consensus_meta {
2162     my ($self, $meta) = @_;
2163     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2164         $self->throw('Not a Bio::Seq::MetaI object');
2165     }
2166     return $self->{'_aln_meta'} = $meta if $meta;
2167     return $self->{'_aln_meta'}
2168 }
2169
2170 =head2 is_flush
2171
2172  Title     : is_flush
2173  Usage     : if ( $ali->is_flush() )
2174  Function  : Tells you whether the alignment
2175            : is flush, i.e. all of the same length
2176  Returns   : 1 or 0
2177  Argument  :
2178
2179 =cut
2180
2181 sub is_flush {
2182     my ($self,$report) = @_;
2183     my $seq;
2184     my $length = (-1);
2185     my $temp;
2186
2187     foreach $seq ( $self->each_seq() ) {
2188         if( $length == (-1) ) {
2189             $length = CORE::length($seq->seq());
2190             next;
2191         }
2192
2193         $temp = CORE::length($seq->seq());
2194         if( $temp != $length ) {
2195             $self->warn("expecting $length not $temp from ".
2196                         $seq->display_id) if( $report );
2197             $self->debug("expecting $length not $temp from ".
2198                          $seq->display_id);
2199             $self->debug($seq->seq(). "\n");
2200             return 0;
2201         }
2202     }
2203
2204     return 1;
2205 }
2206
2207
2208 =head2 length
2209
2210  Title     : length()
2211  Usage     : $len = $ali->length()
2212  Function  : Returns the maximum length of the alignment.
2213              To be sure the alignment is a block, use is_flush
2214  Returns   : Integer
2215  Argument  :
2216
2217 =cut
2218
2219 sub length_aln {
2220     my $self = shift;
2221     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2222     $self->length(@_);
2223 }
2224
2225 sub length {
2226     my $self = shift;
2227     my $seq;
2228     my $length = -1;
2229     my $temp;
2230
2231     foreach $seq ( $self->each_seq() ) {
2232         $temp = $seq->length();
2233         if( $temp > $length ) {
2234             $length = $temp;
2235         }
2236     }
2237
2238     return $length;
2239 }
2240
2241
2242 =head2 maxdisplayname_length
2243
2244  Title     : maxdisplayname_length
2245  Usage     : $ali->maxdisplayname_length()
2246  Function  : Gets the maximum length of the displayname in the
2247              alignment. Used in writing out various MSA formats.
2248  Returns   : integer
2249  Argument  :
2250
2251 =cut
2252
2253 sub maxname_length {
2254     my $self = shift;
2255     $self->deprecated("maxname_length - deprecated method.".
2256                       " Use maxdisplayname_length() instead.");
2257     $self->maxdisplayname_length();
2258 }
2259
2260 sub maxnse_length {
2261     my $self = shift;
2262     $self->deprecated("maxnse_length - deprecated method.".
2263                       " Use maxnse_length() instead.");
2264     $self->maxdisplayname_length();
2265 }
2266
2267 sub maxdisplayname_length {
2268     my $self = shift;
2269     my $maxname = (-1);
2270     my ($seq,$len);
2271
2272     foreach $seq ( $self->each_seq() ) {
2273         $len = CORE::length $self->displayname($seq->get_nse());
2274
2275         if( $len > $maxname ) {
2276             $maxname = $len;
2277         }
2278     }
2279
2280     return $maxname;
2281 }
2282
2283 =head2 max_metaname_length
2284
2285  Title     : max_metaname_length
2286  Usage     : $ali->max_metaname_length()
2287  Function  : Gets the maximum length of the meta name tags in the
2288              alignment for the sequences and for the alignment.
2289              Used in writing out various MSA formats.
2290  Returns   : integer
2291  Argument  : None
2292
2293 =cut
2294
2295 sub max_metaname_length {
2296     my $self = shift;
2297     my $maxname = (-1);
2298     my ($seq,$len);
2299
2300     # check seq meta first
2301     for $seq ( $self->each_seq() ) {
2302         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2303         for my $mtag ($seq->meta_names) {
2304             $len = CORE::length $mtag;
2305             if( $len > $maxname ) {
2306                 $maxname = $len;
2307             }
2308         }
2309     }
2310
2311     # alignment meta
2312     for my $meta ($self->consensus_meta) {
2313         next unless $meta;
2314         for my $name ($meta->meta_names) {
2315             $len = CORE::length $name;
2316             if( $len > $maxname ) {
2317                 $maxname = $len;
2318             }
2319         }
2320     }
2321
2322     return $maxname;
2323 }
2324
2325 =head2 no_residues
2326
2327  Title     : no_residues
2328  Usage     : $no = $ali->no_residues
2329  Function  : number of residues in total in the alignment
2330  Returns   : integer
2331  Argument  :
2332
2333 =cut
2334
2335 sub no_residues {
2336     my $self = shift;
2337     my $count = 0;
2338
2339     foreach my $seq ($self->each_seq) {
2340         my $str = $seq->seq();
2341
2342         $count += ($str =~ s/[A-Za-z]//g);
2343     }
2344
2345     return $count;
2346 }
2347
2348 =head2 no_sequences
2349
2350  Title     : no_sequences
2351  Usage     : $depth = $ali->no_sequences
2352  Function  : number of sequence in the sequence alignment
2353  Returns   : integer
2354  Argument  :
2355
2356 =cut
2357
2358 sub no_sequences {
2359     my $self = shift;
2360
2361     return scalar($self->each_seq);
2362 }
2363
2364
2365 =head2 average_percentage_identity
2366
2367  Title   : average_percentage_identity
2368  Usage   : $id = $align->average_percentage_identity
2369  Function: The function uses a fast method to calculate the average
2370            percentage identity of the alignment
2371  Returns : The average percentage identity of the alignment
2372  Args    : None
2373  Notes   : This method implemented by Kevin Howe calculates a figure that is
2374            designed to be similar to the average pairwise identity of the
2375            alignment (identical in the absence of gaps), without having to
2376            explicitly calculate pairwise identities proposed by Richard Durbin.
2377            Validated by Ewan Birney ad Alex Bateman.
2378
2379 =cut
2380
2381 sub average_percentage_identity{
2382    my ($self,@args) = @_;
2383
2384    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2385                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2386
2387    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2388
2389    if (! $self->is_flush()) {
2390        $self->throw("All sequences in the alignment must be the same length");
2391    }
2392
2393    @seqs = $self->each_seq();
2394    $len = $self->length();
2395
2396    # load the each hash with correct keys for existence checks
2397
2398    for( my $index=0; $index < $len; $index++) {
2399        foreach my $letter (@alphabet) {
2400            $countHashes[$index]->{$letter} = 0;
2401        }
2402    }
2403    foreach my $seq (@seqs)  {
2404        my @seqChars = split //, $seq->seq();
2405        for( my $column=0; $column < @seqChars; $column++ ) {
2406            my $char = uc($seqChars[$column]);
2407            if (exists $countHashes[$column]->{$char}) {
2408                $countHashes[$column]->{$char}++;
2409            }
2410        }
2411    }
2412
2413    $total = 0;
2414    $divisor = 0;
2415    for(my $column =0; $column < $len; $column++) {
2416        my %hash = %{$countHashes[$column]};
2417        $subdivisor = 0;
2418        foreach my $res (keys %hash) {
2419            $total += $hash{$res}*($hash{$res} - 1);
2420            $subdivisor += $hash{$res};
2421        }
2422        $divisor += $subdivisor * ($subdivisor - 1);
2423    }
2424    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2425 }
2426
2427 =head2 percentage_identity
2428
2429  Title   : percentage_identity
2430  Usage   : $id = $align->percentage_identity
2431  Function: The function calculates the average percentage identity
2432            (aliased to average_percentage_identity)
2433  Returns : The average percentage identity
2434  Args    : None
2435
2436 =cut
2437
2438 sub percentage_identity {
2439     my $self = shift;
2440     return $self->average_percentage_identity();
2441 }
2442
2443 =head2 overall_percentage_identity
2444
2445  Title   : overall_percentage_identity
2446  Usage   : $id = $align->overall_percentage_identity
2447            $id = $align->overall_percentage_identity('short')
2448  Function: The function calculates the percentage identity of
2449            the conserved columns
2450  Returns : The percentage identity of the conserved columns
2451  Args    : length value to use, optional defaults to alignment length
2452                  possible values: 'align', 'short', 'long'
2453
2454 The argument values 'short' and 'long' refer to shortest and longest
2455 sequence in the alignment. Method modification code by Hongyu Zhang.
2456
2457 =cut
2458
2459 sub overall_percentage_identity{
2460    my ($self, $length_measure) = @_;
2461
2462    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2463                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2464
2465    my ($len, $total, @seqs, @countHashes);
2466
2467    my %enum = map {$_ => 1} qw (align short long);
2468
2469    $self->throw("Unknown argument [$length_measure]")
2470        if $length_measure and not $enum{$length_measure};
2471    $length_measure ||= 'align';
2472
2473    if (! $self->is_flush()) {
2474        $self->throw("All sequences in the alignment must be the same length");
2475    }
2476
2477    @seqs = $self->each_seq();
2478    $len = $self->length();
2479
2480    # load the each hash with correct keys for existence checks
2481    for( my $index=0; $index < $len; $index++) {
2482        foreach my $letter (@alphabet) {
2483            $countHashes[$index]->{$letter} = 0;
2484        }
2485    }
2486    foreach my $seq (@seqs)  {
2487        my @seqChars = split //, $seq->seq();
2488        for( my $column=0; $column < @seqChars; $column++ ) {
2489            my $char = uc($seqChars[$column]);
2490            if (exists $countHashes[$column]->{$char}) {
2491                $countHashes[$column]->{$char}++;
2492            }
2493        }
2494    }
2495
2496    $total = 0;
2497    for(my $column =0; $column < $len; $column++) {
2498        my %hash = %{$countHashes[$column]};
2499        foreach ( values %hash ) {
2500            next if( $_ == 0 );
2501            $total++ if( $_ == scalar @seqs );
2502            last;
2503        }
2504    }
2505
2506    if ($length_measure eq 'short') {
2507        ## find the shortest length
2508        $len = 0;
2509        foreach my $seq ($self->each_seq) {
2510            my $count = $seq->seq =~ tr/[A-Za-z]//;
2511            if ($len) {
2512                $len = $count if $count < $len;
2513            } else {
2514                $len = $count;
2515            }
2516        }
2517    }
2518    elsif ($length_measure eq 'long') {
2519        ## find the longest length
2520        $len = 0;
2521        foreach my $seq ($self->each_seq) {
2522            my $count = $seq->seq =~ tr/[A-Za-z]//;
2523            if ($len) {
2524                $len = $count if $count > $len;
2525            } else {
2526                $len = $count;
2527            }
2528        }
2529    }
2530
2531    return ($total / $len ) * 100.0;
2532 }
2533
2534
2535
2536 =head1 Alignment positions
2537
2538 Methods to map a sequence position into an alignment column and back.
2539 column_from_residue_number() does the former. The latter is really a
2540 property of the sequence object and can done using
2541 L<Bio::LocatableSeq::location_from_column>:
2542
2543     # select somehow a sequence from the alignment, e.g.
2544     my $seq = $aln->get_seq_by_pos(1);
2545     #$loc is undef or Bio::LocationI object
2546     my $loc = $seq->location_from_column(5);
2547
2548 =head2 column_from_residue_number
2549
2550  Title   : column_from_residue_number
2551  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2552  Function: This function gives the position in the alignment
2553            (i.e. column number) of the given residue number in the
2554            sequence with the given name. For example, for the
2555            alignment
2556
2557              Seq1/91-97 AC..DEF.GH.
2558              Seq2/24-30 ACGG.RTY...
2559                 Seq3/43-51 AC.DDEF.GHI
2560
2561            column_from_residue_number( "Seq1", 94 ) returns 6.
2562            column_from_residue_number( "Seq2", 25 ) returns 2.
2563            column_from_residue_number( "Seq3", 50 ) returns 10.
2564
2565            An exception is thrown if the residue number would lie
2566            outside the length of the aligment
2567            (e.g. column_from_residue_number( "Seq2", 22 )
2568
2569           Note: If the the parent sequence is represented by more than
2570                 one alignment sequence and the residue number is present in
2571                 them, this method finds only the first one.
2572
2573  Returns : A column number for the position in the alignment of the
2574            given residue in the given sequence (1 = first column)
2575  Args    : A sequence id/name (not a name/start-end)
2576            A residue number in the whole sequence (not just that
2577            segment of it in the alignment)
2578
2579 =cut
2580
2581 sub column_from_residue_number {
2582     my ($self, $name, $resnumber) = @_;
2583
2584     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2585     $self->throw("Second argument residue number missing") unless $resnumber;
2586
2587     foreach my $seq ($self->each_seq_with_id($name)) {
2588         my $col;
2589         eval {
2590             $col = $seq->column_from_residue_number($resnumber);
2591         };
2592         next if $@;
2593         return $col;
2594     }
2595
2596     $self->throw("Could not find a sequence segment in $name ".
2597                  "containing residue number $resnumber");
2598
2599 }
2600
2601 =head1 Sequence names
2602
2603 Methods to manipulate the display name. The default name based on the
2604 sequence id and subsequence positions can be overridden in various
2605 ways.
2606
2607 =head2 displayname
2608
2609  Title     : displayname
2610  Usage     : $myalign->displayname("Ig", "IgA")
2611  Function  : Gets/sets the display name of a sequence in the alignment
2612  Returns   : A display name string
2613  Argument  : name of the sequence
2614              displayname of the sequence (optional)
2615
2616 =cut
2617
2618 sub get_displayname {
2619     my $self = shift;
2620     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2621     $self->displayname(@_);
2622 }
2623
2624 sub set_displayname {
2625     my $self = shift;
2626     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2627     $self->displayname(@_);
2628 }
2629
2630 sub displayname {
2631     my ($self, $name, $disname) = @_;
2632
2633     $self->throw("No sequence with name [$name]")
2634         unless defined $self->{'_seq'}->{$name};
2635
2636     if(  $disname and  $name) {
2637         $self->{'_dis_name'}->{$name} = $disname;
2638         return $disname;
2639     }
2640     elsif( defined $self->{'_dis_name'}->{$name} ) {
2641         return  $self->{'_dis_name'}->{$name};
2642     } else {
2643         return $name;
2644     }
2645 }
2646
2647 =head2 set_displayname_count
2648
2649  Title     : set_displayname_count
2650  Usage     : $ali->set_displayname_count
2651  Function  : Sets the names to be name_# where # is the number of
2652              times this name has been used.
2653  Returns   : 1, on success
2654  Argument  :
2655
2656 =cut
2657
2658 sub set_displayname_count {
2659     my $self= shift;
2660     my (@arr,$name,$seq,$count,$temp,$nse);
2661
2662     foreach $seq ( $self->each_alphabetically() ) {
2663         $nse = $seq->get_nse();
2664
2665         #name will be set when this is the second
2666         #time (or greater) is has been seen
2667
2668         if( defined $name and $name eq ($seq->id()) ) {
2669             $temp = sprintf("%s_%s",$name,$count);
2670             $self->displayname($nse,$temp);
2671             $count++;
2672         } else {
2673             $count = 1;
2674             $name = $seq->id();
2675             $temp = sprintf("%s_%s",$name,$count);
2676             $self->displayname($nse,$temp);
2677             $count++;
2678         }
2679     }
2680     return 1;
2681 }
2682
2683 =head2 set_displayname_flat
2684
2685  Title     : set_displayname_flat
2686  Usage     : $ali->set_displayname_flat()
2687  Function  : Makes all the sequences be displayed as just their name,
2688              not name/start-end
2689  Returns   : 1
2690  Argument  :
2691
2692 =cut
2693
2694 sub set_displayname_flat {
2695     my $self = shift;
2696     my ($nse,$seq);
2697
2698     foreach $seq ( $self->each_seq() ) {
2699         $nse = $seq->get_nse();
2700         $self->displayname($nse,$seq->id());
2701     }
2702     return 1;
2703 }
2704
2705 =head2 set_displayname_normal
2706
2707  Title     : set_displayname_normal
2708  Usage     : $ali->set_displayname_normal()
2709  Function  : Makes all the sequences be displayed as name/start-end
2710  Returns   : 1, on success
2711  Argument  :
2712
2713 =cut
2714
2715 sub set_displayname_normal {
2716     my $self = shift;
2717     my ($nse,$seq);
2718
2719     foreach $seq ( $self->each_seq() ) {
2720         $nse = $seq->get_nse();
2721         $self->displayname($nse,$nse);
2722     }
2723     return 1;
2724 }
2725
2726 =head2 source
2727
2728  Title   : source
2729  Usage   : $obj->source($newval)
2730  Function: sets the Alignment source program
2731  Example :
2732  Returns : value of source
2733  Args    : newvalue (optional)
2734
2735
2736 =cut
2737
2738 sub source{
2739    my ($self,$value) = @_;
2740    if( defined $value) {
2741       $self->{'_source'} = $value;
2742     }
2743     return $self->{'_source'};
2744 }
2745
2746 =head2 set_displayname_safe
2747
2748  Title     : set_displayname_safe
2749  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2750  Function  : Assign machine-generated serial names to sequences in input order.
2751              Designed to protect names during PHYLIP runs. Assign 10-char string
2752              in the form of "S000000001" to "S999999999". Restore the original
2753              names using "restore_displayname".
2754  Returns   : 1. a new $aln with system names;
2755              2. a hash ref for restoring names
2756  Argument  : Number for id length (default 10)
2757
2758 =cut
2759
2760 sub set_displayname_safe {
2761     my $self = shift;
2762     my $idlength = shift || 10;
2763     my ($seq, %phylip_name);
2764     my $ct=0;
2765     my $new=Bio::SimpleAlign->new();
2766     foreach $seq ( $self->each_seq() ) {
2767         $ct++;
2768         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2769         $phylip_name{$pname}=$seq->id();
2770         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2771                                             -seq      => $seq->seq(),
2772                                             -alphabet => $seq->alphabet,
2773                                             -start    => $seq->{_start},
2774                                             -end      => $seq->{_end}
2775                                             );
2776         $new->add_seq($new_seq);
2777     }
2778
2779     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2780     return ($new, \%phylip_name);
2781 }
2782
2783 =head2 restore_displayname
2784
2785  Title     : restore_displayname
2786  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2787  Function  : Restore original sequence names (after running
2788              $ali->set_displayname_safe)
2789  Returns   : a new $aln with names restored.
2790  Argument  : a hash reference of names from "set_displayname_safe".
2791
2792 =cut
2793
2794 sub restore_displayname {
2795     my $self = shift;
2796     my $ref=shift;
2797     my %name=%$ref;
2798     my $new=Bio::SimpleAlign->new();
2799     foreach my $seq ( $self->each_seq() ) {
2800       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2801       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2802                                           -seq      => $seq->seq(),
2803                                           -alphabet => $seq->alphabet,
2804                                           -start    => $seq->{_start},
2805                                           -end      => $seq->{_end}
2806                                           );
2807       $new->add_seq($new_seq);
2808     }
2809     return $new;
2810 }
2811
2812 =head2 sort_by_start
2813
2814  Title     : sort_by_start
2815  Usage     : $ali->sort_by_start
2816  Function  : Changes the order of the alignment to the start position of each
2817              subalignment
2818  Returns   :
2819  Argument  :
2820
2821 =cut
2822
2823 sub sort_by_start {
2824     my $self = shift;
2825     my ($seq,$nse,@arr,%hash,$count);
2826     foreach $seq ( $self->each_seq() ) {
2827         $nse = $seq->get_nse;
2828         $hash{$nse} = $seq;
2829     }
2830     $count = 0;
2831     %{$self->{'_order'}} = (); # reset the hash;
2832     foreach $nse ( sort _startend keys %hash) {
2833         $self->{'_order'}->{$count} = $nse;
2834         $count++;
2835     }
2836     1;
2837 }
2838
2839 sub _startend
2840 {
2841     my ($aname,$arange) = split (/[\/]/,$a);
2842     my ($bname,$brange) = split (/[\/]/,$b);
2843     my ($astart,$aend) = split(/\-/,$arange);
2844     my ($bstart,$bend) = split(/\-/,$brange);
2845     return $astart <=> $bstart;
2846 }
2847
2848 =head2 bracket_string
2849
2850  Title     : bracket_string
2851  Usage     : my @params = (-refseq     => 'testseq',
2852                            -allele1    => 'allele1',
2853                            -allele2    => 'allele2',
2854                            -delimiters => '{}',
2855                            -separator  => '/');
2856              $str = $aln->bracket_string(@params)
2857
2858  Function :  When supplied with a list of parameters (see below), returns a
2859              string in BIC format. This is used for allelic comparisons.
2860              Briefly, if either allele contains a base change when compared to
2861              the refseq, the base or gap for each allele is represented in
2862              brackets in the order present in the 'alleles' parameter.
2863
2864              For the following data:
2865
2866              >testseq
2867              GGATCCATTGCTACT
2868              >allele1
2869              GGATCCATTCCTACT
2870              >allele2
2871              GGAT--ATTCCTCCT
2872
2873              the returned string with parameters 'refseq => testseq' and
2874              'alleles => [qw(allele1 allele2)]' would be:
2875
2876              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2877  Returns   : BIC-formatted string
2878  Argument  : Required args
2879                 refseq    : string (ID) of the reference sequence used
2880                             as basis for comparison
2881                 allele1   : string (ID) of the first allele
2882                 allele2   : string (ID) of the second allele
2883              Optional args
2884                 delimiters: two symbol string of left and right delimiters.
2885                             Only the first two symbols are used
2886                             default = '[]'
2887                 separator : string used as a separator.  Only the first
2888                             symbol is used
2889                             default = '/'
2890  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2891
2892 =cut
2893
2894 sub bracket_string {
2895     my ($self, @args) = @_;
2896     my ($ref, $a1, $a2, $delim, $sep) =
2897         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2898     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2899     my ($ld, $rd);
2900     ($ld, $rd) = split('', $delim, 2) if $delim;
2901     $ld ||= '[';
2902     $rd ||= ']';
2903     $sep ||= '/';
2904     my ($refseq, $allele1, $allele2) =
2905         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2906     if (!$refseq || !$allele1 || !$allele2) {
2907         $self->throw("One of your refseq/allele IDs is invalid!");
2908     }
2909     my $len = $self->length-1;
2910     my $bic = '';
2911     # loop over the alignment columns
2912     for my $column ( 0 .. $len ) {
2913         my $string;
2914         my ($compres, $res1, $res2) =
2915             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2916         # are any of the allele symbols different from the refseq?
2917         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2918                 $ld.$res1.$sep.$res2.$rd;
2919         $bic .= $string;
2920     }
2921     return $bic;
2922 }
2923
2924
2925 =head2 methods for Bio::FeatureHolder
2926
2927 FeatureHolder implementation to support labeled character sets like one
2928 would get from NEXUS represented data.
2929
2930 =head2 get_SeqFeatures
2931
2932  Usage   :
2933  Function: Get the feature objects held by this feature holder.
2934  Example :
2935  Returns : an array of Bio::SeqFeatureI implementing objects
2936  Args    : none
2937
2938 At some day we may want to expand this method to allow for a feature
2939 filter to be passed in.
2940
2941 =cut
2942
2943 sub get_SeqFeatures {
2944     my $self = shift;
2945
2946     if( !defined $self->{'_as_feat'} ) {
2947         $self->{'_as_feat'} = [];
2948     }
2949     return @{$self->{'_as_feat'}};
2950 }
2951
2952 =head2 add_SeqFeature
2953
2954  Usage   : $feat->add_SeqFeature($subfeat);
2955            $feat->add_SeqFeature($subfeat,'EXPAND')
2956  Function: adds a SeqFeature into the subSeqFeature array.
2957            with no 'EXPAND' qualifer, subfeat will be tested
2958            as to whether it lies inside the parent, and throw
2959            an exception if not.
2960
2961            If EXPAND is used, the parent''s start/end/strand will
2962            be adjusted so that it grows to accommodate the new
2963            subFeature
2964  Example :
2965  Returns : nothing
2966  Args    : a Bio::SeqFeatureI object
2967
2968 =cut
2969
2970 sub add_SeqFeature {
2971    my ($self,@feat) = @_;
2972
2973    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
2974
2975    foreach my $feat ( @feat ) {
2976        if( !$feat->isa("Bio::SeqFeatureI") ) {
2977            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
2978        }
2979
2980        push(@{$self->{'_as_feat'}},$feat);
2981    }
2982    return 1;
2983 }
2984
2985
2986 =head2 remove_SeqFeatures
2987
2988  Usage   : $obj->remove_SeqFeatures
2989  Function: Removes all sub SeqFeatures.  If you want to remove only a subset,
2990            remove that subset from the returned array, and add back the rest.
2991  Returns : The array of Bio::SeqFeatureI implementing sub-features that was
2992            deleted from this feature.
2993  Args    : none
2994
2995 =cut
2996
2997 sub remove_SeqFeatures {
2998     my $self = shift;
2999
3000     return () unless $self->{'_as_feat'};
3001     my @feats = @{$self->{'_as_feat'}};
3002     $self->{'_as_feat'} = [];
3003     return @feats;
3004 }
3005
3006 =head2 feature_count
3007
3008  Title   : feature_count
3009  Usage   : $obj->feature_count()
3010  Function: Return the number of SeqFeatures attached to a feature holder.
3011
3012            This is before flattening a possible sub-feature tree.
3013
3014            We provide a default implementation here that just counts
3015            the number of objects returned by get_SeqFeatures().
3016            Implementors may want to override this with a more
3017            efficient implementation.
3018
3019  Returns : integer representing the number of SeqFeatures
3020  Args    : None
3021
3022 At some day we may want to expand this method to allow for a feature
3023 filter to be passed in.
3024
3025 Our default implementation allows for any number of additional
3026 arguments and will pass them on to get_SeqFeatures(). I.e., in order to
3027 support filter arguments, just support them in get_SeqFeatures().
3028
3029 =cut
3030
3031 sub feature_count {
3032     my ($self) = @_;
3033
3034     if (defined($self->{'_as_feat'})) {
3035         return ($#{$self->{'_as_feat'}} + 1);
3036     } else {
3037         return 0;
3038     }
3039 }
3040
3041 =head2 get_all_SeqFeatures
3042
3043  Title   : get_all_SeqFeatures
3044  Usage   :
3045  Function: Get the flattened tree of feature objects held by this
3046            feature holder. The difference to get_SeqFeatures is that
3047            the entire tree of sub-features will be flattened out.
3048
3049            We provide a default implementation here, so implementors
3050            don''t necessarily need to implement this method.
3051
3052  Example :
3053  Returns : an array of Bio::SeqFeatureI implementing objects
3054  Args    : none
3055
3056 At some day we may want to expand this method to allow for a feature
3057 filter to be passed in.
3058
3059 Our default implementation allows for any number of additional
3060 arguments and will pass them on to any invocation of
3061 get_SeqFeatures(), wherever a component of the tree implements
3062 FeatureHolderI. I.e., in order to support filter arguments, just
3063 support them in get_SeqFeatures().
3064
3065 =cut
3066
3067 =head2 methods for Bio::AnnotatableI
3068
3069 AnnotatableI implementation to support sequence alignments which
3070 contain annotation (NEXUS, Stockholm).
3071
3072 =head2 annotation
3073
3074  Title   : annotation
3075  Usage   : $ann = $aln->annotation or
3076            $aln->annotation($ann)
3077  Function: Gets or sets the annotation
3078  Returns : Bio::AnnotationCollectionI object
3079  Args    : None or Bio::AnnotationCollectionI object
3080
3081 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3082 for more information
3083
3084 =cut
3085
3086 sub annotation {
3087     my ($obj,$value) = @_;
3088     if( defined $value ) {
3089         $obj->throw("object of class ".ref($value)." does not implement ".
3090                 "Bio::AnnotationCollectionI. Too bad.")
3091             unless $value->isa("Bio::AnnotationCollectionI");
3092         $obj->{'_annotation'} = $value;
3093     } elsif( ! defined $obj->{'_annotation'}) {
3094         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3095     }
3096     return $obj->{'_annotation'};
3097 }
3098
3099
3100 1;