Bio/SimpleAlign.pm

   1 # $Id$
   2 # BioPerl module for SimpleAlign
   3 #
   4 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   5 #
   6 # Copyright Ewan Birney
   7 #
   8 # You may distribute this module under the same terms as perl itself
   9
  10 # POD documentation - main docs before the code
  11 #
  12 #  History:
  13 #       11/3/00 Added threshold feature to consensus and consensus_aa  - PS
  14 #       May 2001 major rewrite - Heikki Lehvaslaiho
  15
  16 =head1 NAME
  17
  18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
  19
  20 =head1 SYNOPSIS
  21
  22   # Use Bio::AlignIO to read in the alignment
  23   $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
  24   $aln = $str->next_aln();
  25
  26   # Describe
  27   print $aln->length;
  28   print $aln->no_residues;
  29   print $aln->is_flush;
  30   print $aln->no_sequences;
  31   print $aln->score;
  32   print $aln->percentage_identity;
  33   print $aln->consensus_string(50);
  34
  35   # Find the position in the alignment for a sequence location
  36   $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
  37
  38   # Extract sequences and check values for the alignment column $pos
  39   foreach $seq ($aln->each_seq) {
  40       $res = $seq->subseq($pos, $pos);
  41       $count{$res}++;
  42   }
  43   foreach $res (keys %count) {
  44       printf "Res: %s  Count: %2d\n", $res, $count{$res};
  45   }
  46
  47   # Manipulate
  48   $aln->remove_seq($seq);
  49   $mini_aln = $aln->slice(20,30);  # get a block of columns
  50   $mini_aln = $aln->select_noncont(1,3,5,7,11); # get single columns
  51   $new_aln = $aln->remove_columns([20,30]); # remove by position
  52   $new_aln = $aln->remove_columns(['mismatch']); # remove by property
  53
  54   # Analyze
  55   $str = $aln->consensus_string($threshold_percent);
  56   $str = $aln->match_line();
  57   $str = $aln->cigar_line()
  58   $id = $aln->percentage_identity;
  59
  60 See the module documentation for details and more methods.
  61
  62 =head1 DESCRIPTION
  63
  64 SimpleAlign is an object that handles a multiple sequence alignment
  65 (MSA). It is very permissive of types (it does not insist on sequences
  66 being all same length, for example). Think of it as a set of sequences
  67 with a whole series of built-in manipulations and methods for reading and
  68 writing alignments.
  69
  70 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
  71 to store its sequences. These are subsequences with a start and end
  72 positions in the parent reference sequence. Each sequence in the
  73 SimpleAlign object is a Bio::LocatableSeq.
  74
  75 SimpleAlign expects the combination of name, start, and end for a
  76 given sequence to be unique in the alignment, and this is the key for the
  77 internal hashes (name, start, end are abbreviated C<nse> in the code).
  78 However, in some cases people do not want the name/start-end to be displayed:
  79 either multiple names in an alignment or names specific to the alignment
  80 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  81 C<displayname>, and generally is what is used to print out the
  82 alignment. They default to name/start-end.
  83
  84 The SimpleAlign Module is derived from the Align module by Ewan Birney.
  85
  86 =head1 FEEDBACK
  87
  88 =head2 Mailing Lists
  89
  90 User feedback is an integral part of the evolution of this and other
  91 Bioperl modules.  Send your comments and suggestions preferably to one
  92 of the Bioperl mailing lists.  Your participation is much appreciated.
  93
  94   bioperl-l@bioperl.org                  - General discussion
  95   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 the bugs and their resolution. Bug reports can be submitted via the
 101 web:
 102
 103   http://bugzilla.open-bio.org/
 104
 105 =head1 AUTHOR
 106
 107 Ewan Birney, birney@ebi.ac.uk
 108
 109 =head1 CONTRIBUTORS
 110
 111 Allen Day, allenday-at-ucla.edu,
 112 Richard Adams, Richard.Adams-at-ed.ac.uk,
 113 David J. Evans, David.Evans-at-vir.gla.ac.uk,
 114 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
 115 Allen Smith, allens-at-cpan.org,
 116 Jason Stajich, jason-at-bioperl.org,
 117 Anthony Underwood, aunderwood-at-phls.org.uk,
 118 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
 119 Brian Osborne, bosborne at alum.mit.edu
 120 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
 121 Hongyu Zhang, forward at hongyu.org
 122
 123 =head1 SEE ALSO
 124
 125 L<Bio::LocatableSeq>
 126
 127 =head1 APPENDIX
 128
 129 The rest of the documentation details each of the object
 130 methods. Internal methods are usually preceded with a _
 131
 132 =cut
 133
 134 # 'Let the code begin...
 135
 136 package Bio::SimpleAlign;
 137 use vars qw(%CONSERVATION_GROUPS);
 138 use strict;
 139
 140 use Bio::LocatableSeq;  # uses Seq's as list
 141
 142 use Bio::Seq;
 143 use Bio::SeqFeature::Generic;
 144
 145 BEGIN {
 146     # This data should probably be in a more centralized module...
 147     # it is taken from Clustalw documentation.
 148     # These are all the positively scoring groups that occur in the
 149     # Gonnet Pam250 matrix. The strong and weak groups are
 150     # defined as strong score >0.5 and weak score =<0.5 respectively.
 151
 152     %CONSERVATION_GROUPS = (
 153             'strong' => [ qw(
 154                                                  STA
 155                                                  NEQK
 156                                                  NHQK
 157                                                  NDEQ
 158                                                  QHRK
 159                                                  MILV
 160                                                  MILF
 161                                                  HY
 162                                                  FYW )],
 163                                 'weak' => [ qw(
 164                       CSA
 165                                                ATV
 166                                                SAG
 167                                                STNK
 168                                                STPA
 169                                                SGND
 170                                                SNDEQK
 171                                                NDEQHK
 172                                                NEQHRK
 173                                                FVLIM
 174                                                HFY )],);
 175 }
 176
 177 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
 178             Bio::FeatureHolderI);
 179
 180 =head2 new
 181
 182  Title     : new
 183  Usage     : my $aln = Bio::SimpleAlign->new();
 184  Function  : Creates a new simple align object
 185  Returns   : Bio::SimpleAlign
 186  Args      : -source => string representing the source program
 187                         where this alignment came from
 188
 189 =cut
 190
 191
 192 sub new {
 193   my($class,@args) = @_;
 194
 195   my $self = $class->SUPER::new(@args);
 196
 197   my ($src,$score,$id) = $self->_rearrange([qw(SOURCE SCORE ID)], @args);
 198   $src && $self->source($src);
 199   defined $score && $self->score($score);
 200   # we need to set up internal hashs first!
 201
 202   $self->{'_seq'} = {};
 203   $self->{'_order'} = {};
 204   $self->{'_start_end_lists'} = {};
 205   $self->{'_dis_name'} = {};
 206   $self->{'_id'} = 'NoName';
 207   # maybe we should automatically read in from args. Hmmm...
 208   $id && $self->id($id);
 209
 210   return $self; # success - we hope!
 211 }
 212
 213 =head1 Modifier methods
 214
 215 These methods modify the MSA by adding, removing or shuffling complete
 216 sequences.
 217
 218 =head2 add_seq
 219
 220  Title     : add_seq
 221  Usage     : $myalign->add_seq($newseq);
 222  Function  : Adds another sequence to the alignment. *Does not* align
 223              it - just adds it to the hashes.
 224  Returns   : nothing
 225  Args      : a Bio::LocatableSeq object
 226              order (optional)
 227
 228 See L<Bio::LocatableSeq> for more information
 229
 230 =cut
 231
 232 sub addSeq {
 233     my $self = shift;
 234     $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
 235     $self->add_seq(@_);
 236 }
 237
 238 sub add_seq {
 239     my $self = shift;
 240     my $seq  = shift;
 241     my $order = shift;
 242     my ($name,$id,$start,$end);
 243
 244     if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
 245         $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
 246     }
 247
 248     $id = $seq->id() ||$seq->display_id || $seq->primary_id;
 249     $start = $seq->start();
 250     $end  = $seq->end();
 251
 252     # build the symbol list for this sequence,
 253     # will prune out the gap and missing/match chars
 254     # when actually asked for the symbol list in the
 255     # symbol_chars
 256     # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
 257
 258     if( !defined $order ) {
 259         $order = keys %{$self->{'_seq'}};
 260     }
 261     $name = sprintf("%s/%d-%d",$id,$start,$end);
 262
 263     if( $self->{'_seq'}->{$name} ) {
 264         $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
 265     }
 266     else {
 267         $self->debug( "Assigning $name to $order\n");
 268
 269         $self->{'_order'}->{$order} = $name;
 270
 271         unless( exists( $self->{'_start_end_lists'}->{$id})) {
 272             $self->{'_start_end_lists'}->{$id} = [];
 273         }
 274         push @{$self->{'_start_end_lists'}->{$id}}, $seq;
 275     }
 276
 277     $self->{'_seq'}->{$name} = $seq;
 278
 279 }
 280
 281
 282 =head2 remove_seq
 283
 284  Title     : remove_seq
 285  Usage     : $aln->remove_seq($seq);
 286  Function  : Removes a single sequence from an alignment
 287  Returns   :
 288  Argument  : a Bio::LocatableSeq object
 289
 290 =cut
 291
 292 sub removeSeq {
 293     my $self = shift;
 294     $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
 295     $self->remove_seq(@_);
 296 }
 297
 298 sub remove_seq {
 299     my $self = shift;
 300     my $seq = shift;
 301     my ($name,$id,$start,$end);
 302
 303     $self->throw("Need Bio::Locatable seq argument ")
 304         unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
 305
 306     $id = $seq->id();
 307     $start = $seq->start();
 308     $end  = $seq->end();
 309     $name = sprintf("%s/%d-%d",$id,$start,$end);
 310
 311     if( !exists $self->{'_seq'}->{$name} ) {
 312         $self->throw("Sequence $name does not exist in the alignment to remove!");
 313     }
 314
 315     delete $self->{'_seq'}->{$name};
 316
 317     # we need to remove this seq from the start_end_lists hash
 318
 319     if (exists $self->{'_start_end_lists'}->{$id}) {
 320         # we need to find the sequence in the array.
 321
 322         my ($i, $found);;
 323         for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
 324             if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
 325                 $found = 1;
 326                 last;
 327             }
 328         }
 329         if ($found) {
 330             splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
 331         }
 332         else {
 333             $self->throw("Could not find the sequence to remoce from the start-end list");
 334         }
 335     }
 336     else {
 337         $self->throw("There is no seq list for the name $id");
 338     }
 339     # we need to shift order hash
 340     my %rev_order = reverse %{$self->{'_order'}};
 341     my $no = $rev_order{$name};
 342     my $no_sequences = $self->no_sequences;
 343     for (; $no < $no_sequences; $no++) {
 344        $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
 345     }
 346     delete $self->{'_order'}->{$no};
 347     return 1;
 348 }
 349
 350
 351 =head2 purge
 352
 353  Title   : purge
 354  Usage   : $aln->purge(0.7);
 355  Function: Removes sequences above given sequence similarity
 356            This function will grind on large alignments. Beware!
 357  Example :
 358  Returns : An array of the removed sequences
 359  Args    : float, threshold for similarity
 360
 361 =cut
 362
 363 sub purge {
 364         my ($self,$perc) = @_;
 365         my (%duplicate, @dups);
 366
 367         my @seqs = $self->each_seq();
 368
 369         for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
 370                 my $seq = $seqs[$i];
 371
 372                 #skip if already in duplicate hash
 373                 next if exists $duplicate{$seq->display_id} ;
 374                 my $one = $seq->seq();
 375
 376                 my @one = split '', $one;       #split to get 1aa per array element
 377
 378                 for (my $j=$i+1;$j < @seqs;$j++) {
 379                         my $seq2 = $seqs[$j];
 380
 381                         #skip if already in duplicate hash
 382                         next if exists $duplicate{$seq2->display_id} ;
 383
 384                         my $two = $seq2->seq();
 385                         my @two = split '', $two;
 386
 387                         my $count = 0;
 388                         my $res = 0;
 389                         for (my $k=0;$k<@one;$k++) {
 390                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 391                                           $one[$k] eq $two[$k]) {
 392                                         $count++;
 393                                 }
 394                                 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
 395                                           $two[$k] ne '.' && $two[$k] ne '-' ) {
 396                                         $res++;
 397                                 }
 398                         }
 399
 400                         my $ratio = 0;
 401                         $ratio = $count/$res unless $res == 0;
 402
 403                         # if above threshold put in duplicate hash and push onto
 404                         # duplicate array for returning to get_unique
 405                         if ( $ratio > $perc ) {
 406                                 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
 407                                 $duplicate{$seq2->display_id} = 1;
 408                                 push @dups, $seq2;
 409                         }
 410                 }
 411         }
 412         foreach my $seq (@dups) {
 413                 $self->remove_seq($seq);
 414         }
 415         return @dups;
 416 }
 417
 418 =head2 sort_alphabetically
 419
 420  Title     : sort_alphabetically
 421  Usage     : $ali->sort_alphabetically
 422  Function  : Changes the order of the alignment to alphabetical on name
 423              followed by numerical by number.
 424  Returns   :
 425  Argument  :
 426
 427 =cut
 428
 429 sub sort_alphabetically {
 430     my $self = shift;
 431     my ($seq,$nse,@arr,%hash,$count);
 432
 433     foreach $seq ( $self->each_seq() ) {
 434         $nse = $seq->get_nse;
 435         $hash{$nse} = $seq;
 436     }
 437
 438     $count = 0;
 439
 440     %{$self->{'_order'}} = (); # reset the hash;
 441
 442     foreach $nse ( sort _alpha_startend keys %hash) {
 443         $self->{'_order'}->{$count} = $nse;
 444
 445         $count++;
 446     }
 447     1;
 448 }
 449
 450 =head2 sort_by_list
 451
 452  Title     : sort_by_list
 453  Usage     : $aln_ordered=$aln->sort_by_list($list_file)
 454  Function  : Arbitrarily order sequences in an alignment
 455  Returns   : A new Bio::SimpleAlign object
 456  Argument  : a file listing sequence names in intended order (one name per line)
 457
 458 =cut
 459
 460 sub sort_by_list {
 461     my ($self, $list) = @_;
 462     my (@seq, @ids, %order);
 463
 464     foreach my $seq ( $self->each_seq() ) {
 465         push @seq, $seq;
 466         push @ids, $seq->display_id;
 467     }
 468
 469     my $ct=1;
 470     open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
 471     while (<$listfh>) {
 472       chomp;
 473       my $name=$_;
 474       $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
 475       $order{$name}=$ct++;
 476     }
 477     close($listfh);
 478
 479     # use the map-sort-map idiom:
 480     my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
 481     my $aln = $self->new;
 482     foreach (@sorted) { $aln->add_seq($_) }
 483     return $aln;
 484 }
 485
 486 =head2 set_new_reference
 487
 488  Title     : set_new_reference
 489  Usage     : $aln->set_new_reference(3 or 'B31'):  Select the 3rd sequence, or
 490              the sequence whoes name is "B31" (full, exact, and case-sensitive),
 491              as the reference (1st) sequence
 492  Function  : Change/Set a new reference (i.e., the first) sequence
 493  Returns   : a new Bio::SimpleAlign object.
 494              Throws an exception if designated sequence not found
 495  Argument  : a positive integer of sequence order, or a sequence name
 496              in the original alignment
 497
 498 =cut
 499
 500 sub set_new_reference {
 501     my ($self, $seqid) = @_;
 502     my $aln = $self->new;
 503     my (@seq, @ids, @new_seq);
 504     my $is_num=0;
 505     foreach my $seq ( $self->each_seq() ) {
 506         push @seq, $seq;
 507         push @ids, $seq->display_id;
 508     }
 509
 510     if ($seqid =~ /^\d+$/) { # argument is seq position
 511         $is_num=1;
 512         $self->throw("The new reference sequence number has to be a positive integer >1 and <= no_sequences ") if ($seqid <= 1 || $seqid > $self->no_sequences);
 513     } else { # argument is a seq name
 514         $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
 515     }
 516
 517     for (my $i=0; $i<=$#seq; $i++) {
 518         my $pos=$i+1;
 519         if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
 520             unshift @new_seq, $seq[$i];
 521         } else {
 522             push @new_seq, $seq[$i];
 523         }
 524     }
 525     foreach (@new_seq) { $aln->add_seq($_);  }
 526     return $aln;
 527 }
 528
 529 sub _in_aln {  # check if input name exists in the alignment
 530     my ($str, $ref) = @_;
 531     foreach (@$ref) {
 532         return 1 if $str eq $_;
 533     }
 534     return 0;
 535 }
 536
 537
 538 =head2 uniq_seq
 539
 540  Title     : uniq_seq
 541  Usage     : $aln->uniq_seq():  Remove identical sequences in
 542              in the alignment.  Ambiguous base ("N", "n") and
 543              leading and ending gaps ("-") are NOT counted as
 544              differences.
 545  Function  : Make a new alignment of unique sequence types (STs)
 546  Returns   : 1. a new Bio::SimpleAlign object (all sequences renamed as "ST")
 547              2. ST of each sequence in STDERR
 548  Argument  : None
 549
 550 =cut
 551
 552 sub uniq_seq {
 553     my ($self, $seqid) = @_;
 554     my $aln = $self->new;
 555     my (%member, %order, @seq, @uniq_str);
 556     my $order=0;
 557     my $len = $self->length();
 558     foreach my $seq ( $self->each_seq() ) {
 559         my $str = $seq->seq();
 560
 561 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
 562 # comparing two sequence strings
 563
 564 # 1st, convert "n", "N" to "?" (for DNA sequence only):
 565         $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
 566 # 2nd, convert leading and ending gaps to "?":
 567         $str = &_convert_leading_ending_gaps($str, '-', '?');
 568         my $new = Bio::LocatableSeq->new(-id      => $seq->id(),
 569                                          -alphabet=> $seq->alphabet,
 570                                          -seq     => $str,
 571                                          -start   => $seq->start,
 572                                          -end     => $seq->end
 573                                          );
 574         push @seq, $new;
 575     }
 576
 577     foreach my $seq (@seq) {
 578         my $str = $seq->seq();
 579         my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
 580         if ($seen) { # seen before
 581             my @memb = @{$member{$key}};
 582             push @memb, $seq;
 583             $member{$key} = \@memb;
 584         } else {  # not seen
 585             push @uniq_str, $key;
 586             $order++;
 587             $member{$key} = [ ($seq) ];
 588             $order{$key} = $order;
 589         }
 590     }
 591
 592     foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
 593 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
 594         my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
 595 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
 596         $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
 597         my $gap='-';
 598         my $end=length($str2);
 599         $end -= length($1) while $str2 =~ m/($gap+)/g;
 600         my $new = Bio::LocatableSeq->new(-id   =>"ST".$order{$str},
 601                                          -seq  =>$str2,
 602                                          -start=>1,
 603                                          -end  =>$end
 604                                          );
 605         $aln->add_seq($new);
 606 #       print STDERR "ST".$order{$str}, "\t=>";
 607         foreach (@{$member{$str}}) {
 608         $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
 609     }
 610 #       print STDERR "\n";
 611     }
 612     return $aln;
 613 }
 614
 615 sub _check_uniq {  # check if same seq exists in the alignment
 616     my ($str1, $ref, $length) = @_;
 617     my @char1=split //, $str1;
 618     my @array=@$ref;
 619
 620     return (0, $str1) if @array==0; # not seen (1st sequence)
 621
 622     foreach my $str2 (@array) {
 623         my $diff=0;
 624         my @char2=split //, $str2;
 625         for (my $i=0; $i<=$length-1; $i++) {
 626             next if $char1[$i] eq '?';
 627             next if $char2[$i] eq '?';
 628             $diff++ if $char1[$i] ne $char2[$i];
 629         }
 630         return (1, $str2) if $diff == 0;  # seen before
 631     }
 632
 633     return (0, $str1); # not seen
 634 }
 635
 636 sub _convert_leading_ending_gaps {
 637     my $s=shift;
 638     my $sym1=shift;
 639     my $sym2=shift;
 640     my @array=split //, $s;
 641 # convert leading char:
 642     for (my $i=0; $i<=$#array; $i++) {
 643         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 644     }
 645 # convert ending char:
 646     for (my $i = $#array; $i>= 0; $i--) {
 647         ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
 648     }
 649     my $s_new=join '', @array;
 650     return $s_new;
 651 }
 652
 653 =head1 Sequence selection methods
 654
 655 Methods returning one or more sequences objects.
 656
 657 =head2 each_seq
 658
 659  Title     : each_seq
 660  Usage     : foreach $seq ( $align->each_seq() )
 661  Function  : Gets a Seq object from the alignment
 662  Returns   : Seq object
 663  Argument  :
 664
 665 =cut
 666
 667 sub eachSeq {
 668     my $self = shift;
 669     $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
 670     $self->each_seq();
 671 }
 672
 673 sub each_seq {
 674         my $self = shift;
 675         my (@arr,$order);
 676
 677         foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
 678                 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
 679                         push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
 680                 }
 681         }
 682         return @arr;
 683 }
 684
 685
 686 =head2 each_alphabetically
 687
 688  Title     : each_alphabetically
 689  Usage     : foreach $seq ( $ali->each_alphabetically() )
 690  Function  : Returns a sequence object, but the objects are returned
 691              in alphabetically sorted order.
 692              Does not change the order of the alignment.
 693  Returns   : Seq object
 694  Argument  :
 695
 696 =cut
 697
 698 sub each_alphabetically {
 699         my $self = shift;
 700         my ($seq,$nse,@arr,%hash,$count);
 701
 702         foreach $seq ( $self->each_seq() ) {
 703                 $nse = $seq->get_nse;
 704                 $hash{$nse} = $seq;
 705         }
 706
 707         foreach $nse ( sort _alpha_startend keys %hash) {
 708                 push(@arr,$hash{$nse});
 709         }
 710         return @arr;
 711 }
 712
 713 sub _alpha_startend {
 714     my ($aname,$astart,$bname,$bstart);
 715     ($aname,$astart) = split (/-/,$a);
 716     ($bname,$bstart) = split (/-/,$b);
 717
 718     if( $aname eq $bname ) {
 719         return $astart <=> $bstart;
 720     }
 721     else {
 722         return $aname cmp $bname;
 723     }
 724 }
 725
 726 =head2 each_seq_with_id
 727
 728  Title     : each_seq_with_id
 729  Usage     : foreach $seq ( $align->each_seq_with_id() )
 730  Function  : Gets a Seq objects from the alignment, the contents
 731              being those sequences with the given name (there may be
 732              more than one)
 733  Returns   : Seq object
 734  Argument  : a seq name
 735
 736 =cut
 737
 738 sub eachSeqWithId {
 739     my $self = shift;
 740     $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
 741     $self->each_seq_with_id(@_);
 742 }
 743
 744 sub each_seq_with_id {
 745     my $self = shift;
 746     my $id = shift;
 747
 748     $self->throw("Method each_seq_with_id needs a sequence name argument")
 749         unless defined $id;
 750
 751     my (@arr, $seq);
 752
 753     if (exists($self->{'_start_end_lists'}->{$id})) {
 754         @arr = @{$self->{'_start_end_lists'}->{$id}};
 755     }
 756     return @arr;
 757 }
 758
 759 =head2 get_seq_by_pos
 760
 761  Title     : get_seq_by_pos
 762  Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
 763  Function  : Gets a sequence based on its position in the alignment.
 764              Numbering starts from 1.  Sequence positions larger than
 765              no_sequences() will thow an error.
 766  Returns   : a Bio::LocatableSeq object
 767  Args      : positive integer for the sequence position
 768
 769 =cut
 770
 771 sub get_seq_by_pos {
 772
 773     my $self = shift;
 774     my ($pos) = @_;
 775
 776     $self->throw("Sequence position has to be a positive integer, not [$pos]")
 777         unless $pos =~ /^\d+$/ and $pos > 0;
 778     $self->throw("No sequence at position [$pos]")
 779         unless $pos <= $self->no_sequences ;
 780
 781     my $nse = $self->{'_order'}->{--$pos};
 782     return $self->{'_seq'}->{$nse};
 783 }
 784
 785 =head2 get_seq_by_id
 786
 787  Title     : get_seq_by_id
 788  Usage     : $seq = $aln->get_seq_by_id($name) # seq named $name
 789  Function  : Gets a sequence based on its name.
 790              Sequences that do not exist will warn and return undef
 791  Returns   : a Bio::LocatableSeq object
 792  Args      : string for sequence name
 793
 794 =cut
 795
 796 sub get_seq_by_id {
 797     my ($self,$name) = @_;
 798     unless( defined $name ) {
 799       $self->warn("Must provide a sequence name");
 800       return undef;
 801     }
 802     for my $seq ( values %{$self->{'_seq'}} ) {
 803       if ( $seq->id eq $name) {
 804         return $seq;
 805       }
 806     }
 807     return undef;
 808 }
 809
 810 =head2 seq_with_features
 811
 812  Title   : seq_with_features
 813  Usage   : $seq = $aln->seq_with_features(-pos => 1,
 814                                           -consensus => 60
 815                                           -mask =>
 816            sub { my $consensus = shift;
 817
 818                  for my $i (1..5){
 819                     my $n = 'N' x $i;
 820                     my $q = '\?' x $i;
 821                     while($consensus =~ /[^?]$q[^?]/){
 822                        $consensus =~ s/([^?])$q([^?])/$1$n$2/;
 823                     }
 824                   }
 825                  return $consensus;
 826                }
 827                                          );
 828  Function: produces a Bio::Seq object by first splicing gaps from -pos
 829            (by means of a splice_by_seq_pos() call), then creating
 830            features using non-? chars (by means of a consensus_string()
 831            call with stringency -consensus).
 832  Returns : a Bio::Seq object
 833  Args    : -pos : required. sequence from which to build the Bio::Seq
 834              object
 835            -consensus : optional, defaults to consensus_string()'s
 836              default cutoff value
 837            -mask : optional, a coderef to apply to consensus_string()'s
 838              output before building features.  this may be useful for
 839              closing gaps of 1 bp by masking over them with N, for
 840              instance
 841
 842 =cut
 843
 844 sub seq_with_features{
 845    my ($self,%arg) = @_;
 846
 847    #first do the preparatory splice
 848    $self->throw("must provide a -pos argument") unless $arg{-pos};
 849    $self->splice_by_seq_pos($arg{-pos});
 850
 851    my $consensus_string = $self->consensus_string($arg{-consensus});
 852    $consensus_string = $arg{-mask}->($consensus_string)
 853          if defined($arg{-mask});
 854
 855    my(@bs,@es);
 856
 857    push @bs, 1 if $consensus_string =~ /^[^?]/;
 858
 859    while($consensus_string =~ /\?[^?]/g){
 860          push @bs, pos($consensus_string);
 861    }
 862    while($consensus_string =~ /[^?]\?/g){
 863          push @es, pos($consensus_string);
 864    }
 865
 866    push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
 867
 868    my $seq = Bio::Seq->new();
 869
 870 #   my $rootfeature = Bio::SeqFeature::Generic->new(
 871 #                -source_tag => 'location',
 872 #                -start      => $self->get_seq_by_pos($arg{-pos})->start,
 873 #                -end        => $self->get_seq_by_pos($arg{-pos})->end,
 874 #                                                  );
 875 #   $seq->add_SeqFeature($rootfeature);
 876
 877    while(my $b = shift @bs){
 878          my $e = shift @es;
 879          $seq->add_SeqFeature(
 880        Bio::SeqFeature::Generic->new(
 881          -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 882          -end   => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
 883          -source_tag => $self->source || 'MSA',
 884        )
 885      );
 886    }
 887
 888    return $seq;
 889 }
 890
 891
 892 =head1 Create new alignments
 893
 894 The result of these methods are horizontal or vertical subsets of the
 895 current MSA.
 896
 897 =head2 select
 898
 899  Title     : select
 900  Usage     : $aln2 = $aln->select(1, 3) # three first sequences
 901  Function  : Creates a new alignment from a continuous subset of
 902              sequences.  Numbering starts from 1.  Sequence positions
 903              larger than no_sequences() will thow an error.
 904  Returns   : a Bio::SimpleAlign object
 905  Args      : positive integer for the first sequence
 906              positive integer for the last sequence to include (optional)
 907
 908 =cut
 909
 910 sub select {
 911     my $self = shift;
 912     my ($start, $end) = @_;
 913
 914     $self->throw("Select start has to be a positive integer, not [$start]")
 915         unless $start =~ /^\d+$/ and $start > 0;
 916     $self->throw("Select end has to be a positive integer, not [$end]")
 917         unless $end  =~ /^\d+$/ and $end > 0;
 918     $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
 919         unless $start <= $end;
 920
 921     my $aln = $self->new;
 922     foreach my $pos ($start .. $end) {
 923         $aln->add_seq($self->get_seq_by_pos($pos));
 924     }
 925     $aln->id($self->id);
 926     # fix for meta, sf, ann
 927     return $aln;
 928 }
 929
 930 =head2 select_noncont
 931
 932  Title     : select_noncont
 933  Usage     : # 1st and 3rd sequences, sorted
 934              $aln2 = $aln->select_noncont(1, 3)
 935
 936              # 1st and 3rd sequences, sorted (same as first)
 937              $aln2 = $aln->select_noncont(3, 1)
 938
 939              # 1st and 3rd sequences, unsorted
 940              $aln2 = $aln->select_noncont('nosort',3, 1)
 941
 942  Function  : Creates a new alignment from a subset of sequences.  Numbering
 943              starts from 1.  Sequence positions larger than no_sequences() will
 944              throw an error.  Sorts the order added to new alignment by default,
 945              to prevent sorting pass 'nosort' as the first argument in the list.
 946  Returns   : a Bio::SimpleAlign object
 947  Args      : array of integers for the sequences.  If the string 'nosort' is
 948              passed as the first argument, the sequences will not be sorted
 949              in the new alignment but will appear in the order listed.
 950
 951 =cut
 952
 953 sub select_noncont {
 954         my $self = shift;
 955     my $nosort = 0;
 956         my (@pos) = @_;
 957     if ($pos[0] !~ m{^\d+$}) {
 958         my $sortcmd = shift @pos;
 959         if ($sortcmd eq 'nosort') {
 960             $nosort = 1;
 961         } else {
 962             $self->throw("Command not recognized: $sortcmd.  Only 'nosort' implemented at this time.");
 963         }
 964     }
 965
 966     my $end = $self->no_sequences;
 967     foreach ( @pos ) {
 968                 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
 969                   unless( /^\d+$/ && $_ > 0 && $_ <= $end );
 970         }
 971
 972         @pos = sort {$a <=> $b} @pos unless $nosort;
 973
 974         my $aln = $self->new;
 975         foreach my $p (@pos) {
 976                 $aln->add_seq($self->get_seq_by_pos($p));
 977         }
 978         $aln->id($self->id);
 979     # fix for meta, sf, ann
 980         return $aln;
 981 }
 982
 983 =head2 slice
 984
 985  Title     : slice
 986  Usage     : $aln2 = $aln->slice(20,30)
 987  Function  : Creates a slice from the alignment inclusive of start and
 988              end columns, and the first column in the alignment is denoted 1.
 989              Sequences with no residues in the slice are excluded from the
 990              new alignment and a warning is printed. Slice beyond the length of
 991              the sequence does not do padding.
 992  Returns   : A Bio::SimpleAlign object
 993  Args      : Positive integer for start column, positive integer for end column,
 994              optional boolean which if true will keep gap-only columns in the newly
 995              created slice. Example:
 996
 997              $aln2 = $aln->slice(20,30,1)
 998
 999 =cut
1000
1001 sub slice {
1002         my $self = shift;
1003         my ($start, $end, $keep_gap_only) = @_;
1004
1005         $self->throw("Slice start has to be a positive integer, not [$start]")
1006           unless $start =~ /^\d+$/ and $start > 0;
1007         $self->throw("Slice end has to be a positive integer, not [$end]")
1008           unless $end =~ /^\d+$/ and $end > 0;
1009         $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1010           unless $start <= $end;
1011         $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1012                                          "[$start] is too big.") if $start > $self->length;
1013     my $cons_meta = $self->consensus_meta;
1014         my $aln = $self->new;
1015         $aln->id($self->id);
1016         foreach my $seq ( $self->each_seq() ) {
1017             my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1018             Bio::Seq::Meta->new
1019         (-id      => $seq->id,
1020                  -alphabet => $seq->alphabet,
1021                  -strand  => $seq->strand,
1022                  -verbose => $self->verbose) :
1023             Bio::LocatableSeq->new
1024         (-id      => $seq->id,
1025                  -alphabet => $seq->alphabet,
1026                  -strand  => $seq->strand,
1027                  -verbose => $self->verbose);
1028
1029             # seq
1030             my $seq_end = $end;
1031             $seq_end = $seq->length if( $end > $seq->length );
1032
1033             my $slice_seq = $seq->subseq($start, $seq_end);
1034             $new_seq->seq( $slice_seq );
1035
1036             $slice_seq =~ s/\W//g;
1037
1038             if ($start > 1) {
1039             my $pre_start_seq = $seq->subseq(1, $start - 1);
1040             $pre_start_seq =~ s/\W//g;
1041             if (!defined($seq->strand)) {
1042                 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1043             } elsif ($seq->strand < 0){
1044                 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1045             } else {
1046                 $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
1047             }
1048             } else {
1049             $new_seq->start( $seq->start);
1050             }
1051         if ($new_seq->isa('Bio::Seq::MetaI')) {
1052             for my $meta_name ($seq->meta_names) {
1053                 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1054             }
1055         }
1056             $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1057
1058             if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1059             $aln->add_seq($new_seq);
1060             } else {
1061             if( $keep_gap_only ) {
1062                 $aln->add_seq($new_seq);
1063             } else {
1064                 my $nse = $seq->get_nse();
1065                 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1066                     " Sequence excluded from the new alignment.");
1067             }
1068             }
1069         }
1070     if ($cons_meta) {
1071         my $new = Bio::Seq::Meta->new();
1072         for my $meta_name ($cons_meta->meta_names) {
1073             $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1074         }
1075         $aln->consensus_meta($new);
1076     }
1077     $aln->annotation($self->annotation);
1078     # fix for meta, sf, ann
1079         return $aln;
1080 }
1081
1082 =head2 remove_columns
1083
1084  Title     : remove_columns
1085  Usage     : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1086              $aln2 = $aln->remove_columns([0,0],[6,8])
1087  Function  : Creates an aligment with columns removed corresponding to
1088              the specified type or by specifying the columns by number.
1089  Returns   : Bio::SimpleAlign object
1090  Args      : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1091              'all_gaps_columns') or array ref where the referenced array
1092              contains a pair of integers that specify a range.
1093              The first column is 0,
1094
1095 =cut
1096
1097 sub remove_columns {
1098         my ($self,@args) = @_;
1099         @args || return $self;
1100    my $aln;
1101
1102         if ($args[0][0] =~ /^[a-z_]+$/i) {
1103                  $aln = $self->_remove_columns_by_type($args[0]);
1104         } elsif ($args[0][0] =~ /^\d+$/) {
1105        $aln = $self->_remove_columns_by_num(\@args);
1106         } else {
1107                  $self->throw("You must pass array references to remove_columns(), not @args");
1108         }
1109     # fix for meta, sf, ann
1110    $aln;
1111 }
1112
1113
1114 =head2 remove_gaps
1115
1116  Title     : remove_gaps
1117  Usage     : $aln2 = $aln->remove_gaps
1118  Function  : Creates an aligment with gaps removed
1119  Returns   : a Bio::SimpleAlign object
1120  Args      : a gap character(optional) if none specified taken
1121                 from $self->gap_char,
1122              [optional] $all_gaps_columns flag (1 or 0, default is 0)
1123                         indicates that only all-gaps columns should be deleted
1124
1125 Used from method L<remove_columns> in most cases. Set gap character
1126 using L<gap_char()|gap_char>.
1127
1128 =cut
1129
1130 sub remove_gaps {
1131     my ($self,$gapchar,$all_gaps_columns) = @_;
1132     my $gap_line;
1133     if ($all_gaps_columns) {
1134         $gap_line = $self->all_gap_line($gapchar);
1135     } else {
1136         $gap_line = $self->gap_line($gapchar);
1137     }
1138     my $aln = $self->new;
1139
1140     my @remove;
1141     my $length = 0;
1142     my $del_char = $gapchar || $self->gap_char;
1143     # Do the matching to get the segments to remove
1144     while ($gap_line =~ m/[$del_char]/g) {
1145         my $start = pos($gap_line)-1;
1146         $gap_line=~/\G[$del_char]+/gc;
1147         my $end = pos($gap_line)-1;
1148
1149         #have to offset the start and end for subsequent removes
1150         $start-=$length;
1151         $end  -=$length;
1152         $length += ($end-$start+1);
1153         push @remove, [$start,$end];
1154     }
1155
1156     #remove the segments
1157     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1158     # fix for meta, sf, ann
1159     return $aln;
1160 }
1161
1162
1163 sub _remove_col {
1164     my ($self,$aln,$remove) = @_;
1165     my @new;
1166
1167     my $gap = $self->gap_char;
1168
1169     # splice out the segments and create new seq
1170     foreach my $seq($self->each_seq){
1171         my $new_seq = Bio::LocatableSeq->new(
1172                                              -id      => $seq->id,
1173                                              -alphabet=> $seq->alphabet,
1174                                              -strand  => $seq->strand,
1175                                              -verbose => $self->verbose);
1176         my $sequence = $seq->seq;
1177         foreach my $pair(@{$remove}){
1178             my $start = $pair->[0];
1179             my $end   = $pair->[1];
1180             $sequence = $seq->seq unless $sequence;
1181             my $orig = $sequence;
1182             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
1183             my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1184             $sequence = $head.$tail;
1185             # start
1186             unless (defined $new_seq->start) {
1187                 if ($start == 0) {
1188                     my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1189                     $new_seq->start($seq->start + $end + 1 - $start_adjust);
1190                 }
1191                 else {
1192                     my $start_adjust = $orig =~ /^$gap+/;
1193                     if ($start_adjust) {
1194                         $start_adjust = $+[0] == $start;
1195                     }
1196                     $new_seq->start($seq->start + $start_adjust);
1197                 }
1198             }
1199             # end
1200             if (($end + 1) >= length($orig)) {
1201                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1202                 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1203             }
1204             else {
1205                 $new_seq->end($seq->end);
1206             }
1207         }
1208
1209         if ($new_seq->end < $new_seq->start) {
1210             # we removed all columns except for gaps: set to 0 to indicate no
1211             # sequence
1212             $new_seq->start(0);
1213             $new_seq->end(0);
1214         }
1215
1216         $new_seq->seq($sequence) if $sequence;
1217                 push @new, $new_seq;
1218     }
1219     # add the new seqs to the alignment
1220     foreach my $new(@new){
1221         $aln->add_seq($new);
1222     }
1223     # fix for meta, sf, ann
1224     return $aln;
1225 }
1226
1227 sub _remove_columns_by_type {
1228         my ($self,$type) = @_;
1229         my $aln = $self->new;
1230         my @remove;
1231
1232         my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1233         my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1234         my %matchchars = ( 'match'           => '\*',
1235                        'weak'             => '\.',
1236                        'strong'           => ':',
1237                        'mismatch'         => ' ',
1238                        'gaps'             => '',
1239                        'all_gaps_columns' => ''
1240                      );
1241         # get the characters to delete against
1242         my $del_char;
1243         foreach my $type (@{$type}){
1244                 $del_char.= $matchchars{$type};
1245         }
1246
1247         my $length = 0;
1248         my $match_line = $self->match_line;
1249         # do the matching to get the segments to remove
1250         if($del_char){
1251                 while($match_line =~ m/[$del_char]/g ){
1252                         my $start = pos($match_line)-1;
1253                         $match_line=~/\G[$del_char]+/gc;
1254                         my $end = pos($match_line)-1;
1255
1256                         #have to offset the start and end for subsequent removes
1257                         $start-=$length;
1258                         $end  -=$length;
1259                         $length += ($end-$start+1);
1260                         push @remove, [$start,$end];
1261                 }
1262         }
1263
1264         # remove the segments
1265         $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1266         $aln = $aln->remove_gaps() if $gap;
1267         $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1268     # fix for meta, sf, ann
1269         $aln;
1270 }
1271
1272
1273 sub _remove_columns_by_num {
1274         my ($self,$positions) = @_;
1275         my $aln = $self->new;
1276
1277         # sort the positions
1278         @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1279
1280     my @remove;
1281     my $length = 0;
1282     foreach my $pos (@{$positions}) {
1283         my ($start, $end) = @{$pos};
1284
1285         #have to offset the start and end for subsequent removes
1286         $start-=$length;
1287         $end  -=$length;
1288         $length += ($end-$start+1);
1289         push @remove, [$start,$end];
1290     }
1291
1292     #remove the segments
1293     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1294     # fix for meta, sf, ann
1295         $aln;
1296 }
1297
1298
1299 =head1 Change sequences within the MSA
1300
1301 These methods affect characters in all sequences without changing the
1302 alignment.
1303
1304 =head2 splice_by_seq_pos
1305
1306  Title   : splice_by_seq_pos
1307  Usage   : $status = splice_by_seq_pos(1);
1308  Function: splices all aligned sequences where the specified sequence
1309            has gaps.
1310  Example :
1311  Returns : 1 on success
1312  Args    : position of sequence to splice by
1313
1314
1315 =cut
1316
1317 sub splice_by_seq_pos{
1318   my ($self,$pos) = @_;
1319
1320   my $guide = $self->get_seq_by_pos($pos);
1321   my $guide_seq = $guide->seq;
1322
1323   $guide_seq =~ s/\./\-/g;
1324
1325   my @gaps = ();
1326   $pos = -1;
1327   while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1328     unshift @gaps, $pos;
1329     $pos++;
1330   }
1331
1332   foreach my $seq ($self->each_seq){
1333     my @bases = split '', $seq->seq;
1334
1335     splice(@bases, $_, 1) foreach @gaps;
1336     $seq->seq(join('', @bases));
1337   }
1338
1339   1;
1340 }
1341
1342 =head2 map_chars
1343
1344  Title     : map_chars
1345  Usage     : $ali->map_chars('\.','-')
1346  Function  : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1347              characters
1348
1349              Notice that the from (arg1) is interpretted as a regex,
1350              so be careful about quoting meta characters (eg
1351              $ali->map_chars('.','-') wont do what you want)
1352  Returns   :
1353  Argument  : 'from' rexexp
1354              'to' string
1355
1356 =cut
1357
1358 sub map_chars {
1359     my $self = shift;
1360     my $from = shift;
1361     my $to   = shift;
1362     my ($seq,$temp);
1363
1364     $self->throw("Need exactly two arguments")
1365         unless defined $from and defined $to;
1366
1367     foreach $seq ( $self->each_seq() ) {
1368         $temp = $seq->seq();
1369         $temp =~ s/$from/$to/g;
1370         $seq->seq($temp);
1371     }
1372     return 1;
1373 }
1374
1375
1376 =head2 uppercase
1377
1378  Title     : uppercase()
1379  Usage     : $ali->uppercase()
1380  Function  : Sets all the sequences to uppercase
1381  Returns   :
1382  Argument  :
1383
1384 =cut
1385
1386 sub uppercase {
1387     my $self = shift;
1388     my $seq;
1389     my $temp;
1390
1391     foreach $seq ( $self->each_seq() ) {
1392       $temp = $seq->seq();
1393       $temp =~ tr/[a-z]/[A-Z]/;
1394
1395       $seq->seq($temp);
1396     }
1397     return 1;
1398 }
1399
1400 =head2 cigar_line
1401
1402  Title    : cigar_line()
1403  Usage    : %cigars = $align->cigar_line()
1404  Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1405             Report) line for each sequence in the alignment. Examples are
1406             "1,60" or "5,10:12,58", where the numbers refer to conserved
1407             positions within the alignment. The keys of the hash are the
1408             NSEs (name/start/end) assigned to each sequence.
1409  Args     : threshold (optional, defaults to 100)
1410  Returns  : Hash of strings (cigar lines)
1411
1412 =cut
1413
1414 sub cigar_line {
1415         my $self = shift;
1416         my $thr=shift||100;
1417         my %cigars;
1418
1419         my @consensus = split "",($self->consensus_string($thr));
1420         my $len = $self->length;
1421         my $gapchar = $self->gap_char;
1422
1423         # create a precursor, something like (1,4,5,6,7,33,45),
1424         # where each number corresponds to a conserved position
1425         foreach my $seq ( $self->each_seq ) {
1426                 my @seq = split "", uc ($seq->seq);
1427                 my $pos = 1;
1428                 for (my $x = 0 ; $x < $len ; $x++ ) {
1429                         if ($seq[$x] eq $consensus[$x]) {
1430                                 push @{$cigars{$seq->get_nse}},$pos;
1431                                 $pos++;
1432                         } elsif ($seq[$x] ne $gapchar) {
1433                                 $pos++;
1434                         }
1435                 }
1436         }
1437         # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1438         for my $name (keys %cigars) {
1439                 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1440                   ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1441       push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1442            ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1443                           ${$cigars{$name}}[$#{$cigars{$name}}] );
1444                 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1445                         if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x]  &&
1446                        ${$cigars{$name}}[$x + 1]  > ${$cigars{$name}}[$x] + 1) {
1447                  splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1448                         }
1449       }
1450         }
1451   # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1452   for my $name (keys %cigars) {
1453           my @remove;
1454           for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1455                    if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1456                              ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1457                       unshift @remove,$x;
1458               }
1459            }
1460       for my $pos (@remove) {
1461                         splice @{$cigars{$name}}, $pos, 1;
1462            }
1463    }
1464    # join and punctuate
1465    for my $name (keys %cigars) {
1466           my ($start,$end,$str) = "";
1467           while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1468                   $str .= ($start . "," . $end . ":");
1469           }
1470           $str =~ s/:$//;
1471       $cigars{$name} = $str;
1472    }
1473    %cigars;
1474 }
1475
1476
1477 =head2 match_line
1478
1479  Title    : match_line()
1480  Usage    : $line = $align->match_line()
1481  Function : Generates a match line - much like consensus string
1482             except that a line indicating the '*' for a match.
1483  Args     : (optional) Match line characters ('*' by default)
1484             (optional) Strong match char (':' by default)
1485             (optional) Weak match char ('.' by default)
1486  Returns  : String
1487
1488 =cut
1489
1490 sub match_line {
1491         my ($self,$matchlinechar, $strong, $weak) = @_;
1492         my %matchchars = ('match'    => $matchlinechar || '*',
1493                                                           'weak'     => $weak          || '.',
1494                                                           'strong'   => $strong        || ':',
1495                                                           'mismatch' => ' ',
1496                                                   );
1497
1498         my @seqchars;
1499         my $alphabet;
1500         foreach my $seq ( $self->each_seq ) {
1501                 push @seqchars, [ split(//, uc ($seq->seq)) ];
1502                 $alphabet = $seq->alphabet unless defined $alphabet;
1503         }
1504         my $refseq = shift @seqchars;
1505         # let's just march down the columns
1506         my $matchline;
1507  POS:
1508         foreach my $pos ( 0..$self->length ) {
1509                 my $refchar = $refseq->[$pos];
1510                 my $char = $matchchars{'mismatch'};
1511                 unless( defined $refchar ) {
1512                         last if $pos == $self->length; # short circuit on last residue
1513                         # this in place to handle jason's soon-to-be-committed
1514                         # intron mapping code
1515                         goto bottom;
1516                 }
1517                 my %col = ($refchar => 1);
1518                 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1519                 foreach my $seq ( @seqchars ) {
1520                         next if $pos >= scalar @$seq;
1521                         $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1522                                                           $seq->[$pos] eq ' ' );
1523                         $col{$seq->[$pos]}++ if defined $seq->[$pos];
1524                 }
1525                 my @colresidues = sort keys %col;
1526
1527                 # if all the values are the same
1528                 if( $dash ) { $char =  $matchchars{'mismatch'} }
1529                 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1530                 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1531                         # matches for protein seqs
1532             TYPE:
1533                         foreach my $type ( qw(strong weak) ) {
1534                                 # iterate through categories
1535                                 my %groups;
1536                                 # iterate through each of the aa in the col
1537                                 # look to see which groups it is in
1538                                 foreach my $c ( @colresidues ) {
1539                                         foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1540                                                 push @{$groups{$f}},$c;
1541                                         }
1542                                 }
1543                          GRP:
1544                                 foreach my $cols ( values %groups ) {
1545                                         @$cols = sort @$cols;
1546                                         # now we are just testing to see if two arrays
1547                                         # are identical w/o changing either one
1548                                         # have to be same len
1549                                         next if( scalar @$cols != scalar @colresidues );
1550                                         # walk down the length and check each slot
1551                                         for($_=0;$_ < (scalar @$cols);$_++ ) {
1552                                                 next GRP if( $cols->[$_] ne $colresidues[$_] );
1553                                         }
1554                                         $char = $matchchars{$type};
1555                                         last TYPE;
1556                                 }
1557                         }
1558                 }
1559          bottom:
1560                 $matchline .= $char;
1561         }
1562         return $matchline;
1563 }
1564
1565
1566 =head2 gap_line
1567
1568  Title    : gap_line()
1569  Usage    : $line = $align->gap_line()
1570  Function : Generates a gap line - much like consensus string
1571             except that a line where '-' represents gap
1572  Args     : (optional) gap line characters ('-' by default)
1573  Returns  : string
1574
1575 =cut
1576
1577 sub gap_line {
1578     my ($self,$gapchar) = @_;
1579     $gapchar = $gapchar || $self->gap_char;
1580     my %gap_hsh; # column gaps vector
1581     foreach my $seq ( $self->each_seq ) {
1582                 my $i = 0;
1583         map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1584                   map {[$i++, $_]} split(//, uc ($seq->seq));
1585     }
1586     my $gap_line;
1587     foreach my $pos ( 0..$self->length-1 ) {
1588           $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1589     }
1590     return $gap_line;
1591 }
1592
1593 =head2 all_gap_line
1594
1595  Title    : all_gap_line()
1596  Usage    : $line = $align->all_gap_line()
1597  Function : Generates a gap line - much like consensus string
1598             except that a line where '-' represents all-gap column
1599  Args     : (optional) gap line characters ('-' by default)
1600  Returns  : string
1601
1602 =cut
1603
1604 sub all_gap_line {
1605     my ($self,$gapchar) = @_;
1606     $gapchar = $gapchar || $self->gap_char;
1607     my %gap_hsh;                # column gaps counter hash
1608     my @seqs = $self->each_seq;
1609     foreach my $seq ( @seqs ) {
1610         my $i = 0;
1611         map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1612         map {[$i++, $_]} split(//, uc ($seq->seq));
1613     }
1614     my $gap_line;
1615     foreach my $pos ( 0..$self->length-1 ) {
1616         if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1617             # gaps column
1618             $gap_line .= $gapchar;
1619         } else {
1620             $gap_line .= '.';
1621         }
1622     }
1623     return $gap_line;
1624 }
1625
1626 =head2 gap_col_matrix
1627
1628  Title    : gap_col_matrix()
1629  Usage    : my $cols = $align->gap_col_matrix()
1630  Function : Generates an array of hashes where
1631             each entry in the array is a hash reference
1632             with keys of all the sequence names and
1633             and value of 1 or 0 if the sequence has a gap at that column
1634  Args     : (optional) gap line characters ($aln->gap_char or '-' by default)
1635
1636 =cut
1637
1638 sub gap_col_matrix {
1639     my ($self,$gapchar) = @_;
1640     $gapchar = $gapchar || $self->gap_char;
1641     my %gap_hsh; # column gaps vector
1642     my @cols;
1643     foreach my $seq ( $self->each_seq ) {
1644         my $i = 0;
1645         my $str = $seq->seq;
1646         my $len = $seq->length;
1647         my $ch;
1648         my $id = $seq->display_id;
1649         while( $i < $len ) {
1650             $ch = substr($str, $i, 1);
1651             $cols[$i++]->{$id} = ($ch eq $gapchar);
1652         }
1653     }
1654     return \@cols;
1655 }
1656
1657 =head2 match
1658
1659  Title     : match()
1660  Usage     : $ali->match()
1661  Function  : Goes through all columns and changes residues that are
1662              identical to residue in first sequence to match '.'
1663              character. Sets match_char.
1664
1665              USE WITH CARE: Most MSA formats do not support match
1666              characters in sequences, so this is mostly for output
1667              only. NEXUS format (Bio::AlignIO::nexus) can handle
1668              it.
1669  Returns   : 1
1670  Argument  : a match character, optional, defaults to '.'
1671
1672 =cut
1673
1674 sub match {
1675     my ($self, $match) = @_;
1676
1677     $match ||= '.';
1678     my ($matching_char) = $match;
1679     $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #';
1680     $self->map_chars($matching_char, '-');
1681
1682     my @seqs = $self->each_seq();
1683     return 1 unless scalar @seqs > 1;
1684
1685     my $refseq = shift @seqs ;
1686     my @refseq = split //, $refseq->seq;
1687     my $gapchar = $self->gap_char;
1688
1689     foreach my $seq ( @seqs ) {
1690         my @varseq = split //, $seq->seq();
1691         for ( my $i=0; $i < scalar @varseq; $i++) {
1692             $varseq[$i] = $match if defined $refseq[$i] &&
1693                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1694                   $refseq[$i] =~ /$gapchar/ )
1695                       && $refseq[$i] eq $varseq[$i];
1696         }
1697         $seq->seq(join '', @varseq);
1698     }
1699     $self->match_char($match);
1700     return 1;
1701 }
1702
1703
1704 =head2 unmatch
1705
1706  Title     : unmatch()
1707  Usage     : $ali->unmatch()
1708  Function  : Undoes the effect of method match. Unsets match_char.
1709  Returns   : 1
1710  Argument  : a match character, optional, defaults to '.'
1711
1712 See L<match> and L<match_char>
1713
1714 =cut
1715
1716 sub unmatch {
1717     my ($self, $match) = @_;
1718
1719     $match ||= '.';
1720
1721     my @seqs = $self->each_seq();
1722     return 1 unless scalar @seqs > 1;
1723
1724     my $refseq = shift @seqs ;
1725     my @refseq = split //, $refseq->seq;
1726     my $gapchar = $self->gap_char;
1727     foreach my $seq ( @seqs ) {
1728         my @varseq = split //, $seq->seq();
1729         for ( my $i=0; $i < scalar @varseq; $i++) {
1730             $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1731                 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1732                   $refseq[$i] =~ /$gapchar/ ) &&
1733                       $varseq[$i] eq $match;
1734         }
1735         $seq->seq(join '', @varseq);
1736     }
1737     $self->match_char('');
1738     return 1;
1739 }
1740
1741 =head1 MSA attributes
1742
1743 Methods for setting and reading the MSA attributes.
1744
1745 Note that the methods defining character semantics depend on the user
1746 to set them sensibly.  They are needed only by certain input/output
1747 methods. Unset them by setting to an empty string ('').
1748
1749 =head2 id
1750
1751  Title     : id
1752  Usage     : $myalign->id("Ig")
1753  Function  : Gets/sets the id field of the alignment
1754  Returns   : An id string
1755  Argument  : An id string (optional)
1756
1757 =cut
1758
1759 sub id {
1760     my ($self, $name) = @_;
1761
1762     if (defined( $name )) {
1763         $self->{'_id'} = $name;
1764     }
1765
1766     return $self->{'_id'};
1767 }
1768
1769 =head2 accession
1770
1771  Title     : accession
1772  Usage     : $myalign->accession("PF00244")
1773  Function  : Gets/sets the accession field of the alignment
1774  Returns   : An acc string
1775  Argument  : An acc string (optional)
1776
1777 =cut
1778
1779 sub accession {
1780     my ($self, $acc) = @_;
1781
1782     if (defined( $acc )) {
1783         $self->{'_accession'} = $acc;
1784     }
1785
1786     return $self->{'_accession'};
1787 }
1788
1789 =head2 description
1790
1791  Title     : description
1792  Usage     : $myalign->description("14-3-3 proteins")
1793  Function  : Gets/sets the description field of the alignment
1794  Returns   : An description string
1795  Argument  : An description string (optional)
1796
1797 =cut
1798
1799 sub description {
1800     my ($self, $name) = @_;
1801
1802     if (defined( $name )) {
1803         $self->{'_description'} = $name;
1804     }
1805
1806     return $self->{'_description'};
1807 }
1808
1809 =head2 missing_char
1810
1811  Title     : missing_char
1812  Usage     : $myalign->missing_char("?")
1813  Function  : Gets/sets the missing_char attribute of the alignment
1814              It is generally recommended to set it to 'n' or 'N'
1815              for nucleotides and to 'X' for protein.
1816  Returns   : An missing_char string,
1817  Argument  : An missing_char string (optional)
1818
1819 =cut
1820
1821 sub missing_char {
1822     my ($self, $char) = @_;
1823
1824     if (defined $char ) {
1825         $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1826         $self->{'_missing_char'} = $char;
1827     }
1828
1829     return $self->{'_missing_char'};
1830 }
1831
1832 =head2 match_char
1833
1834  Title     : match_char
1835  Usage     : $myalign->match_char('.')
1836  Function  : Gets/sets the match_char attribute of the alignment
1837  Returns   : An match_char string,
1838  Argument  : An match_char string (optional)
1839
1840 =cut
1841
1842 sub match_char {
1843     my ($self, $char) = @_;
1844
1845     if (defined $char ) {
1846         $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1847         $self->{'_match_char'} = $char;
1848     }
1849
1850     return $self->{'_match_char'};
1851 }
1852
1853 =head2 gap_char
1854
1855  Title     : gap_char
1856  Usage     : $myalign->gap_char('-')
1857  Function  : Gets/sets the gap_char attribute of the alignment
1858  Returns   : An gap_char string, defaults to '-'
1859  Argument  : An gap_char string (optional)
1860
1861 =cut
1862
1863 sub gap_char {
1864     my ($self, $char) = @_;
1865
1866     if (defined $char || ! defined $self->{'_gap_char'} ) {
1867         $char= '-' unless defined $char;
1868         $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1869         $self->{'_gap_char'} = $char;
1870     }
1871     return $self->{'_gap_char'};
1872 }
1873
1874 =head2 symbol_chars
1875
1876  Title   : symbol_chars
1877  Usage   : my @symbolchars = $aln->symbol_chars;
1878  Function: Returns all the seen symbols (other than gaps)
1879  Returns : array of characters that are the seen symbols
1880  Args    : boolean to include the gap/missing/match characters
1881
1882 =cut
1883
1884 sub symbol_chars{
1885    my ($self,$includeextra) = @_;
1886
1887    unless ($self->{'_symbols'}) {
1888        foreach my $seq ($self->each_seq) {
1889            map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1890        }
1891    }
1892    my %copy = %{$self->{'_symbols'}};
1893    if( ! $includeextra ) {
1894        foreach my $char ( $self->gap_char, $self->match_char,
1895                           $self->missing_char) {
1896            delete $copy{$char} if( defined $char );
1897        }
1898    }
1899    return keys %copy;
1900 }
1901
1902 =head1 Alignment descriptors
1903
1904 These read only methods describe the MSA in various ways.
1905
1906
1907 =head2 score
1908
1909  Title     : score
1910  Usage     : $str = $ali->score()
1911  Function  : get/set a score of the alignment
1912  Returns   : a score for the alignment
1913  Argument  : an optional score to set
1914
1915 =cut
1916
1917 sub score {
1918   my $self = shift;
1919   $self->{score} = shift if @_;
1920   return $self->{score};
1921 }
1922
1923 =head2 consensus_string
1924
1925  Title     : consensus_string
1926  Usage     : $str = $ali->consensus_string($threshold_percent)
1927  Function  : Makes a strict consensus
1928  Returns   : Consensus string
1929  Argument  : Optional treshold ranging from 0 to 100.
1930              The consensus residue has to appear at least threshold %
1931              of the sequences at a given location, otherwise a '?'
1932              character will be placed at that location.
1933              (Default value = 0%)
1934
1935 =cut
1936
1937 sub consensus_string {
1938     my $self = shift;
1939     my $threshold = shift;
1940
1941     my $out = "";
1942     my $len = $self->length - 1;
1943
1944     foreach ( 0 .. $len ) {
1945         $out .= $self->_consensus_aa($_,$threshold);
1946     }
1947     return $out;
1948 }
1949
1950 sub _consensus_aa {
1951     my $self = shift;
1952     my $point = shift;
1953     my $threshold_percent = shift || -1 ;
1954     my ($seq,%hash,$count,$letter,$key);
1955     my $gapchar = $self->gap_char;
1956     foreach $seq ( $self->each_seq() ) {
1957         $letter = substr($seq->seq,$point,1);
1958         $self->throw("--$point-----------") if $letter eq '';
1959         ($letter eq $gapchar || $letter =~ /\./) && next;
1960         # print "Looking at $letter\n";
1961         $hash{$letter}++;
1962     }
1963     my $number_of_sequences = $self->no_sequences();
1964     my $threshold = $number_of_sequences * $threshold_percent / 100. ;
1965     $count = -1;
1966     $letter = '?';
1967
1968     foreach $key ( sort keys %hash ) {
1969         # print "Now at $key $hash{$key}\n";
1970         if( $hash{$key} > $count && $hash{$key} >= $threshold) {
1971             $letter = $key;
1972             $count = $hash{$key};
1973         }
1974     }
1975     return $letter;
1976 }
1977
1978
1979 =head2 consensus_iupac
1980
1981  Title     : consensus_iupac
1982  Usage     : $str = $ali->consensus_iupac()
1983  Function  : Makes a consensus using IUPAC ambiguity codes from DNA
1984              and RNA. The output is in upper case except when gaps in
1985              a column force output to be in lower case.
1986
1987              Note that if your alignment sequences contain a lot of
1988              IUPAC ambiquity codes you often have to manually set
1989              alphabet.  Bio::PrimarySeq::_guess_type thinks they
1990              indicate a protein sequence.
1991  Returns   : consensus string
1992  Argument  : none
1993  Throws    : on protein sequences
1994
1995 =cut
1996
1997 sub consensus_iupac {
1998     my $self = shift;
1999     my $out = "";
2000     my $len = $self->length-1;
2001
2002     # only DNA and RNA sequences are valid
2003     foreach my $seq ( $self->each_seq() ) {
2004         $self->throw("Seq [". $seq->get_nse. "] is a protein")
2005             if $seq->alphabet eq 'protein';
2006     }
2007     # loop over the alignment columns
2008     foreach my $count ( 0 .. $len ) {
2009         $out .= $self->_consensus_iupac($count);
2010     }
2011     return $out;
2012 }
2013
2014 sub _consensus_iupac {
2015     my ($self, $column) = @_;
2016     my ($string, $char, $rna);
2017
2018     #determine all residues in a column
2019     foreach my $seq ( $self->each_seq() ) {
2020         $string .= substr($seq->seq, $column, 1);
2021     }
2022     $string = uc $string;
2023
2024     # quick exit if there's an N in the string
2025     if ($string =~ /N/) {
2026         $string =~ /\W/ ? return 'n' : return 'N';
2027     }
2028     # ... or if there are only gap characters
2029     return '-' if $string =~ /^\W+$/;
2030
2031     # treat RNA as DNA in regexps
2032     if ($string =~ /U/) {
2033         $string =~ s/U/T/;
2034         $rna = 1;
2035     }
2036
2037     # the following s///'s only need to be done to the _first_ ambiguity code
2038     # as we only need to see the _range_ of characters in $string
2039
2040     if ($string =~ /[VDHB]/) {
2041         $string =~ s/V/AGC/;
2042         $string =~ s/D/AGT/;
2043         $string =~ s/H/ACT/;
2044         $string =~ s/B/CTG/;
2045     }
2046
2047     if ($string =~ /[SKYRWM]/) {
2048         $string =~ s/S/GC/;
2049         $string =~ s/K/GT/;
2050         $string =~ s/Y/CT/;
2051         $string =~ s/R/AG/;
2052         $string =~ s/W/AT/;
2053         $string =~ s/M/AC/;
2054     }
2055
2056     # and now the guts of the thing
2057
2058     if ($string =~ /A/) {
2059         $char = 'A';                     # A                      A
2060         if ($string =~ /G/) {
2061             $char = 'R';                 # A and G (purines)      R
2062             if ($string =~ /C/) {
2063                 $char = 'V';             # A and G and C          V
2064                 if ($string =~ /T/) {
2065                     $char = 'N';         # A and G and C and T    N
2066                 }
2067             } elsif ($string =~ /T/) {
2068                 $char = 'D';             # A and G and T          D
2069             }
2070         } elsif ($string =~ /C/) {
2071             $char = 'M';                 # A and C                M
2072             if ($string =~ /T/) {
2073                 $char = 'H';             # A and C and T          H
2074             }
2075         } elsif ($string =~ /T/) {
2076             $char = 'W';                 # A and T                W
2077         }
2078     } elsif ($string =~ /C/) {
2079         $char = 'C';                     # C                      C
2080         if ($string =~ /T/) {
2081             $char = 'Y';                 # C and T (pyrimidines)  Y
2082             if ($string =~ /G/) {
2083                 $char = 'B';             # C and T and G          B
2084             }
2085         } elsif ($string =~ /G/) {
2086             $char = 'S';                 # C and G                S
2087         }
2088     } elsif ($string =~ /G/) {
2089         $char = 'G';                     # G                      G
2090         if ($string =~ /C/) {
2091             $char = 'S';                 # G and C                S
2092         } elsif ($string =~ /T/) {
2093             $char = 'K';                 # G and T                K
2094         }
2095     } elsif ($string =~ /T/) {
2096         $char = 'T';                     # T                      T
2097     }
2098
2099     $char = 'U' if $rna and $char eq 'T';
2100     $char = lc $char if $string =~ /\W/;
2101
2102     return $char;
2103 }
2104
2105
2106 =head2 consensus_meta
2107
2108  Title     : consensus_meta
2109  Usage     : $seqmeta = $ali->consensus_meta()
2110  Function  : Returns a Bio::Seq::Meta object containing the consensus
2111              strings derived from meta data analysis.
2112  Returns   : Bio::Seq::Meta
2113  Argument  : Bio::Seq::Meta
2114  Throws    : non-MetaI object
2115
2116 =cut
2117
2118 sub consensus_meta {
2119     my ($self, $meta) = @_;
2120     if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2121         $self->throw('Not a Bio::Seq::MetaI object');
2122     }
2123     return $self->{'_aln_meta'} = $meta if $meta;
2124     return $self->{'_aln_meta'}
2125 }
2126
2127 =head2 is_flush
2128
2129  Title     : is_flush
2130  Usage     : if ( $ali->is_flush() )
2131  Function  : Tells you whether the alignment
2132            : is flush, i.e. all of the same length
2133  Returns   : 1 or 0
2134  Argument  :
2135
2136 =cut
2137
2138 sub is_flush {
2139     my ($self,$report) = @_;
2140     my $seq;
2141     my $length = (-1);
2142     my $temp;
2143
2144     foreach $seq ( $self->each_seq() ) {
2145         if( $length == (-1) ) {
2146             $length = CORE::length($seq->seq());
2147             next;
2148         }
2149
2150         $temp = CORE::length($seq->seq());
2151         if( $temp != $length ) {
2152             $self->warn("expecting $length not $temp from ".
2153                         $seq->display_id) if( $report );
2154             $self->debug("expecting $length not $temp from ".
2155                          $seq->display_id);
2156             $self->debug($seq->seq(). "\n");
2157             return 0;
2158         }
2159     }
2160
2161     return 1;
2162 }
2163
2164
2165 =head2 length
2166
2167  Title     : length()
2168  Usage     : $len = $ali->length()
2169  Function  : Returns the maximum length of the alignment.
2170              To be sure the alignment is a block, use is_flush
2171  Returns   : Integer
2172  Argument  :
2173
2174 =cut
2175
2176 sub length_aln {
2177     my $self = shift;
2178     $self->deprecated("length_aln - deprecated method. Use length() instead.");
2179     $self->length(@_);
2180 }
2181
2182 sub length {
2183     my $self = shift;
2184     my $seq;
2185     my $length = -1;
2186     my $temp;
2187
2188     foreach $seq ( $self->each_seq() ) {
2189         $temp = $seq->length();
2190         if( $temp > $length ) {
2191             $length = $temp;
2192         }
2193     }
2194
2195     return $length;
2196 }
2197
2198
2199 =head2 maxdisplayname_length
2200
2201  Title     : maxdisplayname_length
2202  Usage     : $ali->maxdisplayname_length()
2203  Function  : Gets the maximum length of the displayname in the
2204              alignment. Used in writing out various MSA formats.
2205  Returns   : integer
2206  Argument  :
2207
2208 =cut
2209
2210 sub maxname_length {
2211     my $self = shift;
2212     $self->deprecated("maxname_length - deprecated method.".
2213                       " Use maxdisplayname_length() instead.");
2214     $self->maxdisplayname_length();
2215 }
2216
2217 sub maxnse_length {
2218     my $self = shift;
2219     $self->deprecated("maxnse_length - deprecated method.".
2220                       " Use maxnse_length() instead.");
2221     $self->maxdisplayname_length();
2222 }
2223
2224 sub maxdisplayname_length {
2225     my $self = shift;
2226     my $maxname = (-1);
2227     my ($seq,$len);
2228
2229     foreach $seq ( $self->each_seq() ) {
2230         $len = CORE::length $self->displayname($seq->get_nse());
2231
2232         if( $len > $maxname ) {
2233             $maxname = $len;
2234         }
2235     }
2236
2237     return $maxname;
2238 }
2239
2240 =head2 max_metaname_length
2241
2242  Title     : max_metaname_length
2243  Usage     : $ali->max_metaname_length()
2244  Function  : Gets the maximum length of the meta name tags in the
2245              alignment for the sequences and for the alignment.
2246              Used in writing out various MSA formats.
2247  Returns   : integer
2248  Argument  : None
2249
2250 =cut
2251
2252 sub max_metaname_length {
2253     my $self = shift;
2254     my $maxname = (-1);
2255     my ($seq,$len);
2256
2257     # check seq meta first
2258     for $seq ( $self->each_seq() ) {
2259         next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2260         for my $mtag ($seq->meta_names) {
2261             $len = CORE::length $mtag;
2262             if( $len > $maxname ) {
2263                 $maxname = $len;
2264             }
2265         }
2266     }
2267
2268     # alignment meta
2269     for my $meta ($self->consensus_meta) {
2270         next unless $meta;
2271         for my $name ($meta->meta_names) {
2272             $len = CORE::length $name;
2273             if( $len > $maxname ) {
2274                 $maxname = $len;
2275             }
2276         }
2277     }
2278
2279     return $maxname;
2280 }
2281
2282 =head2 no_residues
2283
2284  Title     : no_residues
2285  Usage     : $no = $ali->no_residues
2286  Function  : number of residues in total in the alignment
2287  Returns   : integer
2288  Argument  :
2289
2290 =cut
2291
2292 sub no_residues {
2293     my $self = shift;
2294     my $count = 0;
2295
2296     foreach my $seq ($self->each_seq) {
2297         my $str = $seq->seq();
2298
2299         $count += ($str =~ s/[A-Za-z]//g);
2300     }
2301
2302     return $count;
2303 }
2304
2305 =head2 no_sequences
2306
2307  Title     : no_sequences
2308  Usage     : $depth = $ali->no_sequences
2309  Function  : number of sequence in the sequence alignment
2310  Returns   : integer
2311  Argument  :
2312
2313 =cut
2314
2315 sub no_sequences {
2316     my $self = shift;
2317
2318     return scalar($self->each_seq);
2319 }
2320
2321
2322 =head2 average_percentage_identity
2323
2324  Title   : average_percentage_identity
2325  Usage   : $id = $align->average_percentage_identity
2326  Function: The function uses a fast method to calculate the average
2327            percentage identity of the alignment
2328  Returns : The average percentage identity of the alignment
2329  Args    : None
2330  Notes   : This method implemented by Kevin Howe calculates a figure that is
2331            designed to be similar to the average pairwise identity of the
2332            alignment (identical in the absence of gaps), without having to
2333            explicitly calculate pairwise identities proposed by Richard Durbin.
2334            Validated by Ewan Birney ad Alex Bateman.
2335
2336 =cut
2337
2338 sub average_percentage_identity{
2339    my ($self,@args) = @_;
2340
2341    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2342                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2343
2344    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2345
2346    if (! $self->is_flush()) {
2347        $self->throw("All sequences in the alignment must be the same length");
2348    }
2349
2350    @seqs = $self->each_seq();
2351    $len = $self->length();
2352
2353    # load the each hash with correct keys for existence checks
2354
2355    for( my $index=0; $index < $len; $index++) {
2356        foreach my $letter (@alphabet) {
2357            $countHashes[$index]->{$letter} = 0;
2358        }
2359    }
2360    foreach my $seq (@seqs)  {
2361        my @seqChars = split //, $seq->seq();
2362        for( my $column=0; $column < @seqChars; $column++ ) {
2363            my $char = uc($seqChars[$column]);
2364            if (exists $countHashes[$column]->{$char}) {
2365                $countHashes[$column]->{$char}++;
2366            }
2367        }
2368    }
2369
2370    $total = 0;
2371    $divisor = 0;
2372    for(my $column =0; $column < $len; $column++) {
2373        my %hash = %{$countHashes[$column]};
2374        $subdivisor = 0;
2375        foreach my $res (keys %hash) {
2376            $total += $hash{$res}*($hash{$res} - 1);
2377            $subdivisor += $hash{$res};
2378        }
2379        $divisor += $subdivisor * ($subdivisor - 1);
2380    }
2381    return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2382 }
2383
2384 =head2 percentage_identity
2385
2386  Title   : percentage_identity
2387  Usage   : $id = $align->percentage_identity
2388  Function: The function calculates the average percentage identity
2389            (aliased to average_percentage_identity)
2390  Returns : The average percentage identity
2391  Args    : None
2392
2393 =cut
2394
2395 sub percentage_identity {
2396     my $self = shift;
2397     return $self->average_percentage_identity();
2398 }
2399
2400 =head2 overall_percentage_identity
2401
2402  Title   : overall_percentage_identity
2403  Usage   : $id = $align->overall_percentage_identity
2404            $id = $align->overall_percentage_identity('short')
2405  Function: The function calculates the percentage identity of
2406            the conserved columns
2407  Returns : The percentage identity of the conserved columns
2408  Args    : length value to use, optional defaults to alignment length
2409                  possible values: 'align', 'short', 'long'
2410
2411 The argument values 'short' and 'long' refer to shortest and longest
2412 sequence in the alignment. Method modification code by Hongyu Zhang.
2413
2414 =cut
2415
2416 sub overall_percentage_identity{
2417    my ($self, $length_measure) = @_;
2418
2419    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2420                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2421
2422    my ($len, $total, @seqs, @countHashes);
2423
2424    my %enum = map {$_ => 1} qw (align short long);
2425
2426    $self->throw("Unknown argument [$length_measure]")
2427        if $length_measure and not $enum{$length_measure};
2428    $length_measure ||= 'align';
2429
2430    if (! $self->is_flush()) {
2431        $self->throw("All sequences in the alignment must be the same length");
2432    }
2433
2434    @seqs = $self->each_seq();
2435    $len = $self->length();
2436
2437    # load the each hash with correct keys for existence checks
2438    for( my $index=0; $index < $len; $index++) {
2439        foreach my $letter (@alphabet) {
2440            $countHashes[$index]->{$letter} = 0;
2441        }
2442    }
2443    foreach my $seq (@seqs)  {
2444        my @seqChars = split //, $seq->seq();
2445        for( my $column=0; $column < @seqChars; $column++ ) {
2446            my $char = uc($seqChars[$column]);
2447            if (exists $countHashes[$column]->{$char}) {
2448                $countHashes[$column]->{$char}++;
2449            }
2450        }
2451    }
2452
2453    $total = 0;
2454    for(my $column =0; $column < $len; $column++) {
2455        my %hash = %{$countHashes[$column]};
2456        foreach ( values %hash ) {
2457            next if( $_ == 0 );
2458            $total++ if( $_ == scalar @seqs );
2459            last;
2460        }
2461    }
2462
2463    if ($length_measure eq 'short') {
2464        ## find the shortest length
2465        $len = 0;
2466        foreach my $seq ($self->each_seq) {
2467            my $count = $seq->seq =~ tr/[A-Za-z]//;
2468            if ($len) {
2469                $len = $count if $count < $len;
2470            } else {
2471                $len = $count;
2472            }
2473        }
2474    }
2475    elsif ($length_measure eq 'long') {
2476        ## find the longest length
2477        $len = 0;
2478        foreach my $seq ($self->each_seq) {
2479            my $count = $seq->seq =~ tr/[A-Za-z]//;
2480            if ($len) {
2481                $len = $count if $count > $len;
2482            } else {
2483                $len = $count;
2484            }
2485        }
2486    }
2487
2488    return ($total / $len ) * 100.0;
2489 }
2490
2491
2492
2493 =head1 Alignment positions
2494
2495 Methods to map a sequence position into an alignment column and back.
2496 column_from_residue_number() does the former. The latter is really a
2497 property of the sequence object and can done using
2498 L<Bio::LocatableSeq::location_from_column>:
2499
2500     # select somehow a sequence from the alignment, e.g.
2501     my $seq = $aln->get_seq_by_pos(1);
2502     #$loc is undef or Bio::LocationI object
2503     my $loc = $seq->location_from_column(5);
2504
2505 =head2 column_from_residue_number
2506
2507  Title   : column_from_residue_number
2508  Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2509  Function: This function gives the position in the alignment
2510            (i.e. column number) of the given residue number in the
2511            sequence with the given name. For example, for the
2512            alignment
2513
2514              Seq1/91-97 AC..DEF.GH.
2515              Seq2/24-30 ACGG.RTY...
2516                 Seq3/43-51 AC.DDEF.GHI
2517
2518            column_from_residue_number( "Seq1", 94 ) returns 6.
2519            column_from_residue_number( "Seq2", 25 ) returns 2.
2520            column_from_residue_number( "Seq3", 50 ) returns 10.
2521
2522            An exception is thrown if the residue number would lie
2523            outside the length of the aligment
2524            (e.g. column_from_residue_number( "Seq2", 22 )
2525
2526           Note: If the the parent sequence is represented by more than
2527                 one alignment sequence and the residue number is present in
2528                 them, this method finds only the first one.
2529
2530  Returns : A column number for the position in the alignment of the
2531            given residue in the given sequence (1 = first column)
2532  Args    : A sequence id/name (not a name/start-end)
2533            A residue number in the whole sequence (not just that
2534            segment of it in the alignment)
2535
2536 =cut
2537
2538 sub column_from_residue_number {
2539     my ($self, $name, $resnumber) = @_;
2540
2541     $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2542     $self->throw("Second argument residue number missing") unless $resnumber;
2543
2544     foreach my $seq ($self->each_seq_with_id($name)) {
2545         my $col;
2546         eval {
2547             $col = $seq->column_from_residue_number($resnumber);
2548         };
2549         next if $@;
2550         return $col;
2551     }
2552
2553     $self->throw("Could not find a sequence segment in $name ".
2554                  "containing residue number $resnumber");
2555
2556 }
2557
2558 =head1 Sequence names
2559
2560 Methods to manipulate the display name. The default name based on the
2561 sequence id and subsequence positions can be overridden in various
2562 ways.
2563
2564 =head2 displayname
2565
2566  Title     : displayname
2567  Usage     : $myalign->displayname("Ig", "IgA")
2568  Function  : Gets/sets the display name of a sequence in the alignment
2569  Returns   : A display name string
2570  Argument  : name of the sequence
2571              displayname of the sequence (optional)
2572
2573 =cut
2574
2575 sub get_displayname {
2576     my $self = shift;
2577     $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2578     $self->displayname(@_);
2579 }
2580
2581 sub set_displayname {
2582     my $self = shift;
2583     $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2584     $self->displayname(@_);
2585 }
2586
2587 sub displayname {
2588     my ($self, $name, $disname) = @_;
2589
2590     $self->throw("No sequence with name [$name]")
2591         unless defined $self->{'_seq'}->{$name};
2592
2593     if(  $disname and  $name) {
2594         $self->{'_dis_name'}->{$name} = $disname;
2595         return $disname;
2596     }
2597     elsif( defined $self->{'_dis_name'}->{$name} ) {
2598         return  $self->{'_dis_name'}->{$name};
2599     } else {
2600         return $name;
2601     }
2602 }
2603
2604 =head2 set_displayname_count
2605
2606  Title     : set_displayname_count
2607  Usage     : $ali->set_displayname_count
2608  Function  : Sets the names to be name_# where # is the number of
2609              times this name has been used.
2610  Returns   : 1, on success
2611  Argument  :
2612
2613 =cut
2614
2615 sub set_displayname_count {
2616     my $self= shift;
2617     my (@arr,$name,$seq,$count,$temp,$nse);
2618
2619     foreach $seq ( $self->each_alphabetically() ) {
2620         $nse = $seq->get_nse();
2621
2622         #name will be set when this is the second
2623         #time (or greater) is has been seen
2624
2625         if( defined $name and $name eq ($seq->id()) ) {
2626             $temp = sprintf("%s_%s",$name,$count);
2627             $self->displayname($nse,$temp);
2628             $count++;
2629         } else {
2630             $count = 1;
2631             $name = $seq->id();
2632             $temp = sprintf("%s_%s",$name,$count);
2633             $self->displayname($nse,$temp);
2634             $count++;
2635         }
2636     }
2637     return 1;
2638 }
2639
2640 =head2 set_displayname_flat
2641
2642  Title     : set_displayname_flat
2643  Usage     : $ali->set_displayname_flat()
2644  Function  : Makes all the sequences be displayed as just their name,
2645              not name/start-end
2646  Returns   : 1
2647  Argument  :
2648
2649 =cut
2650
2651 sub set_displayname_flat {
2652     my $self = shift;
2653     my ($nse,$seq);
2654
2655     foreach $seq ( $self->each_seq() ) {
2656         $nse = $seq->get_nse();
2657         $self->displayname($nse,$seq->id());
2658     }
2659     return 1;
2660 }
2661
2662 =head2 set_displayname_normal
2663
2664  Title     : set_displayname_normal
2665  Usage     : $ali->set_displayname_normal()
2666  Function  : Makes all the sequences be displayed as name/start-end
2667  Returns   : 1, on success
2668  Argument  :
2669
2670 =cut
2671
2672 sub set_displayname_normal {
2673     my $self = shift;
2674     my ($nse,$seq);
2675
2676     foreach $seq ( $self->each_seq() ) {
2677         $nse = $seq->get_nse();
2678         $self->displayname($nse,$nse);
2679     }
2680     return 1;
2681 }
2682
2683 =head2 source
2684
2685  Title   : source
2686  Usage   : $obj->source($newval)
2687  Function: sets the Alignment source program
2688  Example :
2689  Returns : value of source
2690  Args    : newvalue (optional)
2691
2692
2693 =cut
2694
2695 sub source{
2696    my ($self,$value) = @_;
2697    if( defined $value) {
2698       $self->{'_source'} = $value;
2699     }
2700     return $self->{'_source'};
2701 }
2702
2703 =head2 set_displayname_safe
2704
2705  Title     : set_displayname_safe
2706  Usage     : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2707  Function  : Assign machine-generated serial names to sequences in input order.
2708              Designed to protect names during PHYLIP runs. Assign 10-char string
2709              in the form of "S000000001" to "S999999999". Restore the original
2710              names using "restore_displayname".
2711  Returns   : 1. a new $aln with system names;
2712              2. a hash ref for restoring names
2713  Argument  : Number for id length (default 10)
2714
2715 =cut
2716
2717 sub set_displayname_safe {
2718     my $self = shift;
2719     my $idlength = shift || 10;
2720     my ($seq, %phylip_name);
2721     my $ct=0;
2722     my $new=Bio::SimpleAlign->new();
2723     foreach $seq ( $self->each_seq() ) {
2724         $ct++;
2725         my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2726         $phylip_name{$pname}=$seq->id();
2727         my $new_seq= Bio::LocatableSeq->new(-id       => $pname,
2728                                             -seq      => $seq->seq(),
2729                                             -alphabet => $seq->alphabet,
2730                                             -start    => $seq->{_start},
2731                                             -end      => $seq->{_end}
2732                                             );
2733         $new->add_seq($new_seq);
2734     }
2735
2736     $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2737     return ($new, \%phylip_name);
2738 }
2739
2740 =head2 restore_displayname
2741
2742  Title     : restore_displayname
2743  Usage     : $aln_name_restored=$ali->restore_displayname($hash_ref)
2744  Function  : Restore original sequence names (after running
2745              $ali->set_displayname_safe)
2746  Returns   : a new $aln with names restored.
2747  Argument  : a hash reference of names from "set_displayname_safe".
2748
2749 =cut
2750
2751 sub restore_displayname {
2752     my $self = shift;
2753     my $ref=shift;
2754     my %name=%$ref;
2755     my $new=Bio::SimpleAlign->new();
2756     foreach my $seq ( $self->each_seq() ) {
2757       $self->throw("No sequence with name") unless defined $name{$seq->id()};
2758       my $new_seq= Bio::LocatableSeq->new(-id       => $name{$seq->id()},
2759                                           -seq      => $seq->seq(),
2760                                           -alphabet => $seq->alphabet,
2761                                           -start    => $seq->{_start},
2762                                           -end      => $seq->{_end}
2763                                           );
2764       $new->add_seq($new_seq);
2765     }
2766     return $new;
2767 }
2768
2769 =head2 sort_by_start
2770
2771  Title     : sort_by_start
2772  Usage     : $ali->sort_by_start
2773  Function  : Changes the order of the alignment to the start position of each
2774              subalignment
2775  Returns   :
2776  Argument  :
2777
2778 =cut
2779
2780 sub sort_by_start {
2781     my $self = shift;
2782     my ($seq,$nse,@arr,%hash,$count);
2783     foreach $seq ( $self->each_seq() ) {
2784         $nse = $seq->get_nse;
2785         $hash{$nse} = $seq;
2786     }
2787     $count = 0;
2788     %{$self->{'_order'}} = (); # reset the hash;
2789     foreach $nse ( sort _startend keys %hash) {
2790         $self->{'_order'}->{$count} = $nse;
2791         $count++;
2792     }
2793     1;
2794 }
2795
2796 sub _startend
2797 {
2798     my ($aname,$arange) = split (/[\/]/,$a);
2799     my ($bname,$brange) = split (/[\/]/,$b);
2800     my ($astart,$aend) = split(/\-/,$arange);
2801     my ($bstart,$bend) = split(/\-/,$brange);
2802     return $astart <=> $bstart;
2803 }
2804
2805 =head2 bracket_string
2806
2807  Title     : bracket_string
2808  Usage     : my @params = (-refseq     => 'testseq',
2809                            -allele1    => 'allele1',
2810                            -allele2    => 'allele2',
2811                            -delimiters => '{}',
2812                            -separator  => '/');
2813              $str = $aln->bracket_string(@params)
2814
2815  Function :  When supplied with a list of parameters (see below), returns a
2816              string in BIC format. This is used for allelic comparisons.
2817              Briefly, if either allele contains a base change when compared to
2818              the refseq, the base or gap for each allele is represented in
2819              brackets in the order present in the 'alleles' parameter.
2820
2821              For the following data:
2822
2823              >testseq
2824              GGATCCATTGCTACT
2825              >allele1
2826              GGATCCATTCCTACT
2827              >allele2
2828              GGAT--ATTCCTCCT
2829
2830              the returned string with parameters 'refseq => testseq' and
2831              'alleles => [qw(allele1 allele2)]' would be:
2832
2833              GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2834  Returns   : BIC-formatted string
2835  Argument  : Required args
2836                 refseq    : string (ID) of the reference sequence used
2837                             as basis for comparison
2838                 allele1   : string (ID) of the first allele
2839                 allele2   : string (ID) of the second allele
2840              Optional args
2841                 delimiters: two symbol string of left and right delimiters.
2842                             Only the first two symbols are used
2843                             default = '[]'
2844                 separator : string used as a separator.  Only the first
2845                             symbol is used
2846                             default = '/'
2847  Throws    : On no refseq/alleles, or invalid refseq/alleles.
2848
2849 =cut
2850
2851 sub bracket_string {
2852     my ($self, @args) = @_;
2853     my ($ref, $a1, $a2, $delim, $sep) =
2854         $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2855     $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2856     my ($ld, $rd);
2857     ($ld, $rd) = split('', $delim, 2) if $delim;
2858     $ld ||= '[';
2859     $rd ||= ']';
2860     $sep ||= '/';
2861     my ($refseq, $allele1, $allele2) =
2862         map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2863     if (!$refseq || !$allele1 || !$allele2) {
2864         $self->throw("One of your refseq/allele IDs is invalid!");
2865     }
2866     my $len = $self->length-1;
2867     my $bic = '';
2868     # loop over the alignment columns
2869     for my $column ( 0 .. $len ) {
2870         my $string;
2871         my ($compres, $res1, $res2) =
2872             map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2873         # are any of the allele symbols different from the refseq?
2874         $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2875                 $ld.$res1.$sep.$res2.$rd;
2876         $bic .= $string;
2877     }
2878     return $bic;
2879 }
2880
2881
2882 =head2 methods for Bio::FeatureHolder
2883
2884 FeatureHolder implementation to support labeled character sets like one
2885 would get from NEXUS represented data.
2886
2887 =head2 get_SeqFeatures
2888
2889  Usage   :
2890  Function: Get the feature objects held by this feature holder.
2891  Example :
2892  Returns : an array of Bio::SeqFeatureI implementing objects
2893  Args    : none
2894
2895 At some day we may want to expand this method to allow for a feature
2896 filter to be passed in.
2897
2898 =cut
2899
2900 sub get_SeqFeatures {
2901     my $self = shift;
2902
2903     if( !defined $self->{'_as_feat'} ) {
2904         $self->{'_as_feat'} = [];
2905     }
2906     return @{$self->{'_as_feat'}};
2907 }
2908
2909 =head2 add_SeqFeature
2910
2911  Usage   : $feat->add_SeqFeature($subfeat);
2912            $feat->add_SeqFeature($subfeat,'EXPAND')
2913  Function: adds a SeqFeature into the subSeqFeature array.
2914            with no 'EXPAND' qualifer, subfeat will be tested
2915            as to whether it lies inside the parent, and throw
2916            an exception if not.
2917
2918            If EXPAND is used, the parent''s start/end/strand will
2919            be adjusted so that it grows to accommodate the new
2920            subFeature
2921  Example :
2922  Returns : nothing
2923  Args    : a Bio::SeqFeatureI object
2924
2925 =cut
2926
2927 sub add_SeqFeature {
2928    my ($self,@feat) = @_;
2929
2930    $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
2931
2932    foreach my $feat ( @feat ) {
2933        if( !$feat->isa("Bio::SeqFeatureI") ) {
2934            $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
2935        }
2936
2937        push(@{$self->{'_as_feat'}},$feat);
2938    }
2939    return 1;
2940 }
2941
2942
2943 =head2 remove_SeqFeatures
2944
2945  Usage   : $obj->remove_SeqFeatures
2946  Function: Removes all sub SeqFeatures.  If you want to remove only a subset,
2947            remove that subset from the returned array, and add back the rest.
2948  Returns : The array of Bio::SeqFeatureI implementing sub-features that was
2949            deleted from this feature.
2950  Args    : none
2951
2952 =cut
2953
2954 sub remove_SeqFeatures {
2955     my $self = shift;
2956
2957     return () unless $self->{'_as_feat'};
2958     my @feats = @{$self->{'_as_feat'}};
2959     $self->{'_as_feat'} = [];
2960     return @feats;
2961 }
2962
2963 =head2 feature_count
2964
2965  Title   : feature_count
2966  Usage   : $obj->feature_count()
2967  Function: Return the number of SeqFeatures attached to a feature holder.
2968
2969            This is before flattening a possible sub-feature tree.
2970
2971            We provide a default implementation here that just counts
2972            the number of objects returned by get_SeqFeatures().
2973            Implementors may want to override this with a more
2974            efficient implementation.
2975
2976  Returns : integer representing the number of SeqFeatures
2977  Args    : None
2978
2979 At some day we may want to expand this method to allow for a feature
2980 filter to be passed in.
2981
2982 Our default implementation allows for any number of additional
2983 arguments and will pass them on to get_SeqFeatures(). I.e., in order to
2984 support filter arguments, just support them in get_SeqFeatures().
2985
2986 sub feature_count {
2987     my ($self) = @_;
2988
2989     if (defined($self->{'_as_feat'})) {
2990         return ($#{$self->{'_as_feat'}} + 1);
2991     } else {
2992         return 0;
2993     }
2994 }
2995
2996 =head2 get_all_SeqFeatures
2997
2998  Title   : get_all_SeqFeatures
2999  Usage   :
3000  Function: Get the flattened tree of feature objects held by this
3001            feature holder. The difference to get_SeqFeatures is that
3002            the entire tree of sub-features will be flattened out.
3003
3004            We provide a default implementation here, so implementors
3005            don''t necessarily need to implement this method.
3006
3007  Example :
3008  Returns : an array of Bio::SeqFeatureI implementing objects
3009  Args    : none
3010
3011 At some day we may want to expand this method to allow for a feature
3012 filter to be passed in.
3013
3014 Our default implementation allows for any number of additional
3015 arguments and will pass them on to any invocation of
3016 get_SeqFeatures(), wherever a component of the tree implements
3017 FeatureHolderI. I.e., in order to support filter arguments, just
3018 support them in get_SeqFeatures().
3019
3020 =cut
3021
3022 =head2 methods for Bio::AnnotatableI
3023
3024 AnnotatableI implementation to support sequence alignments which
3025 contain annotation (NEXUS, Stockholm).
3026
3027 =head2 annotation
3028
3029  Title   : annotation
3030  Usage   : $ann = $aln->annotation or
3031            $aln->annotation($ann)
3032  Function: Gets or sets the annotation
3033  Returns : Bio::AnnotationCollectionI object
3034  Args    : None or Bio::AnnotationCollectionI object
3035
3036 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3037 for more information
3038
3039 =cut
3040
3041 sub annotation {
3042     my ($obj,$value) = @_;
3043     if( defined $value ) {
3044         $obj->throw("object of class ".ref($value)." does not implement ".
3045                 "Bio::AnnotationCollectionI. Too bad.")
3046             unless $value->isa("Bio::AnnotationCollectionI");
3047         $obj->{'_annotation'} = $value;
3048     } elsif( ! defined $obj->{'_annotation'}) {
3049         $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3050     }
3051     return $obj->{'_annotation'};
3052 }
3053
3054
3055 1;