Bio/SimpleAlign.pm

   1
   2 #
   3 # BioPerl module for SimpleAlign
   4 #
   5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 SimpleAlign - Multiple alignments held as a set of sequences
  16
  17 =head1 SYNOPSIS
  18
  19     $aln = new Bio::SimpleAlign;
  20
  21     $aln->read_MSF(\*STDIN);
  22
  23     $aln->write_fasta(\*STDOUT);
  24
  25 =head1 INSTALLATION
  26
  27 This module is included with the central Bioperl distribution:
  28
  29    http://bio.perl.org/Core/Latest
  30    ftp://bio.perl.org/pub/DIST
  31
  32 Follow the installation instructions included in the README file.
  33
  34 =head1 DESCRIPTION
  35
  36 SimpleAlign handles multiple alignments of sequences. It is very permissive
  37 of types (it wont insist on things being all same length etc): really
  38 it is a SequenceSet explicitly held in memory with a whole series of
  39 built in manipulations and especially file format systems for
  40 read/writing alignments.
  41
  42 SimpleAlign basically views an alignment as an immutable block of text.
  43 SimpleAlign *is not* the object to be using if you want to manipulate an
  44 alignment (eg, truncate an alignment or remove columns that are all gaps).
  45 These functions are much better done by UnivAln by Georg Fuellen.
  46
  47 However for lightweight display/formatting - this is the one to use.
  48
  49 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
  50 the alignment, and this is the key for the internal hashes.
  51 (name,start,end is abreviated nse in the code). However, in many cases
  52 people don't want the name/start-end to be displayed: either multiple
  53 names in an alignment or names specific to the alignment
  54 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
  55 'displayname', and generally is what is used to print out the
  56 alignment. They default to name/start-end
  57
  58 The SimpleAlign Module came from Ewan Birney's Align module
  59
  60 =head1 PROGRESS
  61
  62 SimpleAlign is being slowly converted to bioperl coding standards,
  63 mainly by Ewan.
  64
  65 =over
  66
  67 =item Use Bio::Root::Object - done
  68
  69 =item Use proper exceptions - done
  70
  71 =item Use hashed constructor - not done!
  72
  73 =end
  74
  75 =head1 FEEDBACK
  76
  77 =head2 Mailing Lists
  78
  79 User feedback is an integral part of the evolution of this and other Bioperl modules.
  80 Send your comments and suggestions preferably to one of the Bioperl mailing lists.
  81 Your participation is much appreciated.
  82
  83     vsns-bcd-perl@lists.uni-bielefeld.de          - General discussion
  84     vsns-bcd-perl-guts@lists.uni-bielefeld.de     - Technically-oriented discussion
  85     http://bio.perl.org/MailList.html             - About the mailing lists
  86
  87 =head2 Reporting Bugs
  88
  89 Report bugs to the Bioperl bug tracking system to help us keep track the bugs and
  90 their resolution. Bug reports can be submitted via email or the web:
  91
  92     bioperl-bugs@bio.perl.org
  93     http://bio.perl.org/bioperl-bugs/
  94
  95 =head1 AUTHOR
  96
  97 Ewan Birney, birney@sanger.ac.uk
  98
  99 =head1 SEE ALSO
 100
 101  Bio::Seq.pm - The biosequence object
 102
 103  http://bio.perl.org/Projects/modules.html  - Online module documentation
 104  http://bio.perl.org/Projects/SeqAlign/     - Bioperl sequence alignment project
 105  http://bio.perl.org/                       - Bioperl Project Homepage
 106
 107 =head1 APPENDIX
 108
 109 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
 110
 111 =cut
 112
 113
 114 # Let the code begin...
 115
 116
 117 package Bio::SimpleAlign;
 118 use vars qw($AUTOLOAD @ISA);
 119 use strict;
 120
 121 # Object preamble - inheriets from Bio::Root::Object
 122
 123 use Bio::Root::Object;
 124 use Bio::Seq;         # uses Seq's as list
 125
 126
 127 @ISA = qw(Bio::Root::Object Exporter);
 128 # new() is inherited from Bio::Root::Object
 129
 130 # _initialize is where the heavy stuff will happen when new is called
 131
 132 sub _initialize {
 133   my($self,@args) = @_;
 134
 135   my $make = $self->SUPER::_initialize;
 136
 137   # we need to set up internal hashs first!
 138
 139   $self->{'seq'} = {};
 140   $self->{'order'} = {};
 141   $self->{'dis_name'} = {};
 142   $self->{'id'} = 'NoName';
 143
 144   # maybe we should automatically read in from args. Hmmm...
 145
 146 # set stuff in self from @args
 147   return $make; # success - we hope!
 148 }
 149
 150
 151
 152
 153 =head2 id
 154
 155  Title     : id
 156  Usage     : $myalign->id("Ig")
 157  Function  : Gets/sets the id field of the alignment
 158            :
 159  Returns   : An id string
 160  Argument  : An id string (optional)
 161
 162 =cut
 163
 164 sub id {
 165     my ($self, $name) = @_;
 166
 167     if (defined( $name )) {
 168         $self->{'id'} = $name;
 169     }
 170
 171     return $self->{'id'};
 172 }
 173
 174
 175
 176
 177 =head2 addSeq
 178
 179  Title     : addSeq
 180  Usage     : $myalign->addSeq($newseq);
 181            :
 182            :
 183  Function  : Adds another sequence to the alignment
 184            : *doesn't* align it - just adds it to the
 185            : hashes
 186            :
 187  Returns   : nothing
 188  Argument  :
 189
 190 =cut
 191
 192
 193 sub addSeq {
 194     my $self = shift;
 195     my $seq  = shift;
 196     my $order = shift;
 197     my ($name,$id,$start,$end);
 198
 199     $id = $seq->id();
 200     $start = $seq->start();
 201     $end  = $seq->end();
 202
 203
 204     if( !defined $order ) {
 205         $order = keys %{$self->{'seq'}};
 206     }
 207
 208     $name = sprintf("%s-%d-%d",$id,$start,$end);
 209
 210     if( $self->{'seq'}->{$name} ) {
 211         $self->warn("Replacing one sequence [$name]\n");
 212     }
 213
 214     $self->{'seq'}->{$name} = $seq;
 215     $self->{'order'}->{$order} = $name;
 216
 217 }
 218
 219
 220 =head2 removeSeq
 221
 222  Title     : removeSeq
 223  Usage     : $aln->removeSeq($seq);
 224  Function  : removes a single sequence from an alignment
 225
 226 =cut
 227
 228
 229
 230 sub removeSeq {
 231     my $self = shift;
 232     my $seq = shift;
 233     my ($name,$id,$start,$end);
 234
 235     $seq->out_fasta(\*STDOUT);
 236     $id = $seq->id();
 237     $start = $seq->start();
 238     $end  = $seq->end();
 239     $name = sprintf("%s-%d-%d",$id,$start,$end);
 240
 241     if( !exists $self->{'seq'}->{$name} ) {
 242         $self->throw("Sequence $name does not exist in the alignment to remove!");
 243     }
 244
 245     delete $self->{'seq'}->{$name};
 246
 247     # we can't do anything about the order hash but that is ok
 248     # because eachSeq will handle it
 249 }
 250
 251
 252 =head2 eachSeq
 253
 254  Title     : eachSeq
 255  Usage     : foreach $seq ( $align->eachSeq() )
 256            :
 257            :
 258  Function  : gets an array of Seq objects from the
 259            : alignment
 260            :
 261            :
 262  Returns   : an array
 263  Argument  : nothing
 264
 265 =cut
 266
 267 sub eachSeq {
 268     my $self = shift;
 269     my (@arr,$order);
 270
 271     foreach $order ( sort { $a <=> $b } keys %{$self->{'order'}} ) {
 272         if( exists $self->{'seq'}->{$self->{'order'}->{$order}} ) {
 273             #print STDOUT sprintf("Putting %s\n", $self->{'order'}->{$order});
 274             push(@arr,$self->{'seq'}->{$self->{'order'}->{$order}});
 275         }
 276     }
 277
 278     return @arr;
 279 }
 280
 281
 282
 283 =head2 consensus_string
 284
 285  Title     : consensus_string
 286  Usage     : $str = $ali->consensus_string()
 287            :
 288            :
 289  Function  : Makes a consensus
 290            :
 291            :
 292            :
 293  Returns   :
 294  Argument  :
 295
 296 =cut
 297
 298 sub consensus_string {
 299     my $self = shift;
 300     my $len;
 301     my ($out,$count);
 302
 303     $out = "";
 304
 305     $len = $self->length_aln();
 306
 307     foreach $count ( 0 .. $len ) {
 308         $out .= $self->consensus_aa($count);
 309     }
 310
 311     return $out;
 312
 313 }
 314
 315 sub consensus_aa {
 316     my $self = shift;
 317     my $point = shift;
 318     my ($seq,%hash,$count,$letter,$key);
 319
 320
 321     foreach $seq ( $self->eachSeq() ) {
 322         $letter = substr($seq->{'seq'},$point,1);
 323         ($letter =~ /\./) && next;
 324         # print "Looking at $letter\n";
 325         $hash{$letter}++;
 326     }
 327
 328     $count = -1;
 329     $letter = '?';
 330
 331     foreach $key ( keys %hash ) {
 332         # print "Now at $key $hash{$key}\n";
 333         if( $hash{$key} > $count ) {
 334             $letter = $key;
 335             $count = $hash{$key};
 336         }
 337     }
 338     return $letter;
 339
 340 }
 341
 342
 343
 344 =head2 read_MSF
 345
 346  Title   : read_MSF
 347  Usage   : $al->read_MSF(\*STDIN);
 348  Function: reads MSF formatted files. Tries to read *all* MSF
 349           It reads all non whitespace characters in the alignment
 350           area. For MSFs with weird gaps (eg ~~~) map them by using
 351           $al->map_chars('~','-');
 352  Example :
 353  Returns :
 354  Args    : filehandle
 355
 356
 357 =cut
 358
 359 sub read_MSF{
 360    my ($self,$fh) = @_;
 361    my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq);
 362
 363    # read in the name section
 364
 365    while( <$fh> ) {
 366        /\/\// && last; # move to alignment section
 367        /Name:\s+(\S+)/ && do { $name = $1;
 368                                $hash{$name} = ""; # blank line
 369                                push(@names,$name); # we need it ordered!
 370                            };
 371        # otherwise - skip
 372    }
 373
 374    # alignment section
 375
 376    while( <$fh> ) {
 377        /^\s+(\S+)\s+(.*)$/ && do {
 378            $name = $1;
 379            $str = $2;
 380            if( ! exists $hash{$name} ) {
 381                $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
 382            }
 383            $str =~ s/\s//g;
 384            $hash{$name} .= $str;
 385        };
 386    }
 387
 388    # now got this as a name - sequence hash. Lets make some sequences!
 389
 390    $count = 0;
 391
 392    foreach $name ( @names ) {
 393        if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
 394            $seqname = $1;
 395            $start = $2;
 396            $end = $3;
 397        } else {
 398            $seqname=$name;
 399            $start = 1;
 400            $str = $hash{$name};
 401            $str =~ s/[^A-Za-z]//g;
 402            $end = length($str);
 403        }
 404
 405        $seq = new Bio::Seq('-seq'=>$hash{$name},
 406                            '-id'=>$seqname,
 407                            '-start'=>$start,
 408                            '-end'=>$end,
 409                            '-type'=>'aligned');
 410
 411        $self->addSeq($seq);
 412
 413        $count++;
 414    }
 415
 416    return $count;
 417 }
 418
 419 =head2 write_MSF
 420
 421  Title     : write_MSF
 422  Usage     : $ali->write_MSF(\*FH)
 423            :
 424            :
 425  Function  : writes MSF format output
 426            :
 427            :
 428            :
 429  Returns   :
 430  Argument  :
 431
 432 =cut
 433
 434 sub write_MSF {
 435     my $self = shift;
 436     my $file = shift;
 437     my $msftag;
 438     my $type;
 439     my $count = 0;
 440     my $maxname;
 441     my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index);
 442
 443     $date = localtime(time);
 444     $msftag = "MSF";
 445     $type = "P";
 446     $maxname = $self->maxnse_length();
 447     $length  = $self->length_aln();
 448     $name = $self->id();
 449     if( !defined $name ) {
 450         $name = "Align";
 451     }
 452
 453     print $file sprintf("\n%s   MSF: %d  Type: P  %s  Check: 00 ..\n\n",$name,$self->no_sequences,$date);
 454
 455     foreach $seq ( $self->eachSeq() ) {
 456         $name = $self->get_displayname($seq->get_nse());
 457         $miss = $maxname - length ($name);
 458         $miss += 2;
 459         $pad  = " " x $miss;
 460
 461         print $file sprintf(" Name: %s%sLen:    %d  Check:  %d  Weight:  1.00\n",$name,$pad,length $seq->str(),$seq->GCG_checksum());
 462
 463         $hash{$name} = $seq->str();
 464         push(@arr,$name);
 465     }
 466
 467     #
 468     # ok - heavy handed, but there you go.
 469     #
 470     print "\n//\n";
 471
 472     while( $count < $length ) {
 473
 474         # there is another block to go!
 475
 476         foreach $name  ( @arr ) {
 477             print $file sprintf("%20s  ",$name);
 478
 479             $tempcount = $count;
 480             $index = 0;
 481             while( ($tempcount + 10 < $length) && ($index < 5)  ) {
 482
 483                 print $file sprintf("%s ",substr($hash{$name},$tempcount,10));
 484
 485                 $tempcount += 10;
 486                 $index++;
 487             }
 488
 489             #
 490             # ok, could be the very last guy ;)
 491             #
 492
 493             if( $index < 5) {
 494
 495                 #
 496                 # space to print!
 497                 #
 498
 499                 print $file sprintf("%s ",substr($hash{$name},$tempcount));
 500                 $tempcount += 10;
 501             }
 502
 503             print $file "\n";
 504         } # end of each sequence
 505
 506         print "\n\n";
 507
 508         $count = $tempcount;
 509     }
 510 }
 511
 512
 513 sub maxname_length {
 514     my $self = shift;
 515     my $maxname = (-1);
 516     my ($seq,$len);
 517
 518     foreach $seq ( $self->eachSeq() ) {
 519         $len = length $seq->id();
 520
 521         if( $len > $maxname ) {
 522             $maxname = $len;
 523         }
 524     }
 525
 526     return $maxname;
 527 }
 528
 529 sub maxnse_length {
 530     my $self = shift;
 531     my $maxname = (-1);
 532     my ($seq,$len);
 533
 534     foreach $seq ( $self->eachSeq() ) {
 535         $len = length $seq->get_nse();
 536
 537         if( $len > $maxname ) {
 538             $maxname = $len;
 539         }
 540     }
 541
 542     return $maxname;
 543 }
 544
 545 sub maxdisplayname_length {
 546     my $self = shift;
 547     my $maxname = (-1);
 548     my ($seq,$len);
 549
 550     foreach $seq ( $self->eachSeq() ) {
 551         $len = length $self->get_displayname($seq->get_nse());
 552
 553         if( $len > $maxname ) {
 554             $maxname = $len;
 555         }
 556     }
 557
 558     return $maxname;
 559 }
 560
 561
 562 =head2 length_aln
 563
 564  Title     : length_aln()
 565  Usage     : $len = $ali->length_aln()
 566            :
 567            :
 568  Function  : returns the maximum length of the alignment.
 569            : To be sure the alignment is a block, use is_flush
 570            :
 571            :
 572  Returns   :
 573  Argument  :
 574
 575 =cut
 576
 577 sub length_aln {
 578     my $self = shift;
 579     my $seq;
 580     my $length = (-1);
 581     my ($temp,$len);
 582
 583     foreach $seq ( $self->eachSeq() ) {
 584         $temp = length($seq->str());
 585         if( $temp > $length ) {
 586             $length = $temp;
 587         }
 588     }
 589
 590     return $length;
 591 }
 592
 593
 594 =head2 is_flush
 595
 596  Title     : is_flush
 597  Usage     : if( $ali->is_flush() )
 598            :
 599            :
 600  Function  : Tells you whether the alignment
 601            : is flush, ie all of the same length
 602            :
 603            :
 604  Returns   : 1 or 0
 605  Argument  :
 606
 607 =cut
 608
 609 sub is_flush {
 610     my $self = shift;
 611     my $seq;
 612     my $length = (-1);
 613     my $temp;
 614
 615     foreach $seq ( $self->eachSeq() ) {
 616         if( $length == (-1) ) {
 617             $length = length($seq->str());
 618             next;
 619         }
 620
 621         $temp = length($seq->str());
 622
 623         if( $temp != $length ) {
 624             return 0;
 625         }
 626     }
 627
 628     return 1;
 629 }
 630
 631
 632 =head2 read_fasta
 633
 634  Title     : read_fasta
 635  Usage     : $ali->read_fasta(\*INPUT)
 636            :
 637            :
 638  Function  : reads in a fasta formatted
 639            : file for an alignment
 640            :
 641            :
 642  Returns   :
 643  Argument  :
 644
 645 =cut
 646
 647 sub read_fasta {
 648     my $self = shift;
 649     my $in = shift;
 650     my $count = 0;
 651     my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align);
 652
 653     while( <$in> ) {
 654         if( /^>(\S+)/ ) {
 655             $tempname = $1;
 656             if( defined $name ) {
 657                 # put away last name and sequence
 658
 659                 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
 660                     $seqname = $1;
 661                     $start = $2;
 662                     $end = $3;
 663                 } else {
 664                     $seqname=$name;
 665                     $start = 1;
 666                     $end = length($align{$name});
 667                 }
 668
 669                 $seq = new Bio::Seq('-seq'=>$seqchar,
 670                                     '-id'=>$seqname,
 671                                     '-start'=>$start,
 672                                     '-end'=>$end,
 673                                     '-type'=>'aligned');
 674
 675                 $self->addSeq($seq);
 676
 677                 $count++;
 678             }
 679             $name = $tempname;
 680             $seqchar  = "";
 681             next;
 682         }
 683         s/[^A-Za-z\.\-]//g;
 684         $seqchar .= $_;
 685
 686     }
 687     # put away last name and sequence
 688
 689     if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
 690         $seqname = $1;
 691         $start = $2;
 692         $end = $3;
 693     } else {
 694         $seqname=$name;
 695         $start = 1;
 696         $end = length($align{$name});
 697     }
 698
 699     $seq = new Bio::Seq('-seq'=>$seqchar,
 700                         '-id'=>$seqname,
 701                         '-start'=>$start,
 702                         '-end'=>$end,
 703                         '-type'=>'aligned');
 704
 705     $self->addSeq($seq);
 706
 707     $count++;
 708
 709     return $count;
 710 }
 711
 712
 713 =head2 read_selex
 714
 715  Title     : read_selex
 716  Usage     : $ali->read_selex(\*INPUT)
 717            :
 718            :
 719  Function  : reads selex (hmmer) format
 720            : alignments
 721            :
 722            :
 723  Returns   :
 724  Argument  :
 725
 726 =cut
 727
 728 sub read_selex {
 729     my $self = shift;
 730     my $in = shift;
 731     my ($start,$end,%align,$name,$seqname,$seq,$count,%hash,%c2name,$no);
 732
 733     # not quite sure 'what' selex format is, but here we go!
 734
 735     while( <$in> ) {
 736         #/^#=SQ/ && last;
 737         /^#=RF/ && last;
 738     }
 739
 740     while( <$in> ) {
 741         /^#/ && next;
 742         /^\s/ && next;
 743         /\w/ && last;
 744     }
 745
 746     if( eof($in) ) {
 747         warn("Exhausted file without reading selex format");
 748         return undef;
 749     }
 750     # first line starting with word
 751     chop;
 752
 753     $count =0;
 754
 755     if( !/^(\S+)\s+([A-Za-z\.\-]+)\s*/ ) {
 756         warn("Ok. I think [$_] does not look like selex format. Not sure...");
 757         return (-1);
 758     }
 759
 760     $name = $1;
 761     $seq = $2;
 762
 763     $align{$name} .= $seq;
 764     $c2name{$count} = $name;
 765     $count++;
 766
 767     while( <$in> ) {
 768         !/^([^\#]\S+)\s+([A-Za-z\.\-]+)\s*/ && next;
 769
 770         $name = $1;
 771         $seq = $2;
 772
 773         if( ! defined $align{$name}  ) {
 774             $c2name{$count} = $name;
 775             $count++;
 776         }
 777
 778         $align{$name} .= $seq;
 779     }
 780
 781     # ok... now we can make the sequences
 782
 783     $count = 0;
 784     foreach $no ( sort { $a <=> $b } keys %c2name ) {
 785         $name = $c2name{$no};
 786
 787         if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
 788             $seqname = $1;
 789             $start = $2;
 790             $end = $3;
 791         } else {
 792             $seqname=$name;
 793             $start = 1;
 794             $end = length($align{$name});
 795         }
 796
 797         $seq = new Bio::Seq('-seq'=>$align{$name},
 798                             '-id'=>$seqname,
 799                             '-start'=>$start,
 800                             '-end'=>$end,
 801                             '-type'=>'aligned'
 802                             );
 803
 804         $self->addSeq($seq);
 805
 806         $count++;
 807     }
 808
 809     return $count;
 810 }
 811
 812
 813 =head2 read_mase
 814
 815  Title     : read_mase
 816  Usage     : $ali->read_mase(\*INPUT)
 817            :
 818            :
 819  Function  : reads mase (seaview)
 820            : formatted alignments
 821            :
 822            :
 823  Returns   :
 824  Argument  :
 825
 826 =cut
 827
 828 sub read_mase {
 829     my $self = shift;
 830     my $in = shift;
 831     my $name;
 832     my $start;
 833     my $end;
 834     my $seq;
 835     my $add;
 836     my $count = 0;
 837
 838
 839     while( <$in> ) {
 840         /^;/ && next;
 841         if( /^(\S+)\/(\d+)-(\d+)/ ) {
 842             $name = $1;
 843             $start = $2;
 844             $end = $3;
 845         } else {
 846             s/\s//g;
 847             $name = $_;
 848             $end = -1;
 849         }
 850
 851         $seq = "";
 852
 853         while( <$in> ) {
 854             /^;/ && last;
 855             s/[^A-Za-z\.\-]//g;
 856             $seq .= $_;
 857         }
 858         if( $end == -1) {
 859             $start = 1;
 860             $end = length($seq);
 861         }
 862
 863         $add = new Bio::Seq('-seq'=>$seq,
 864                             '-id'=>$name,
 865                             '-start'=>$start,
 866                             '-end'=>$end,
 867                             '-type'=>'aligned');
 868
 869
 870
 871         $self->addSeq($add);
 872
 873         $count++;
 874     }
 875
 876     return $count;
 877 }
 878
 879
 880
 881 =head2 read_Pfam_file
 882
 883  Title     : read_Pfam_file
 884  Usage     : $ali->read_Pfam_file("thisfile");
 885            :
 886  Function  : opens a filename, reads
 887            : a Pfam (mul) formatted alignment
 888            :
 889            :
 890            :
 891  Returns   :
 892  Argument  :
 893
 894 =cut
 895
 896 sub read_Pfam_file {
 897     my $self = shift;
 898     my $file = shift;
 899     my $out;
 900
 901     if( open(_TEMPFILE,$file) == 0 ) {
 902         $self->throw("Could not open $file for reading Pfam style alignment");
 903         return 0;
 904     }
 905
 906     $out = read_Pfam($self,\*_TEMPFILE);
 907
 908     close(_TEMPFILE);
 909
 910     return $out;
 911 }
 912
 913
 914 =head2 read_Pfam
 915
 916  Title     : read_Pfam
 917  Usage     : $ali->read_Pfam(\*INPUT)
 918            :
 919            :
 920  Function  : reads a Pfam formatted
 921            : Alignment (Mul format).
 922            : - this is the format used by Belvu
 923            :
 924  Returns   :
 925  Argument  :
 926
 927 =cut
 928
 929 sub read_Pfam {
 930     my $self = shift;
 931     my $in = shift;
 932     my $name;
 933     my $start;
 934     my $end;
 935     my $seq;
 936     my $add;
 937     my $acc;
 938     my %names;
 939     my $count = 0;
 940
 941     while( <$in> ) {
 942         chop;
 943         /^\/\// && last;
 944         if( !/^(\S+)\/(\d+)-(\d+)\s+(\S+)\s*/ ) {
 945             $self->throw("Found a bad line [$_] in the pfam format alignment");
 946             next;
 947         }
 948
 949         $name = $1;
 950         $start = $2;
 951         $end = $3;
 952         $seq = $4;
 953
 954
 955         # there may be an accession number at the end of the line
 956
 957         ($acc) = ($' =~ /\s*(\S+)\s*/);
 958         $names{'acc'} = $acc;
 959
 960         $add = new Bio::Seq('-seq'=>$seq,
 961                             '-id'=>$name,
 962                             '-names'=>\%names,
 963                             '-start'=>$start,
 964                             '-end'=>$end,
 965                             '-type'=>'aligned');
 966
 967         $self->addSeq($add);
 968
 969         $count++;
 970     }
 971
 972     return $count;
 973 }
 974
 975 sub write_Pfam_link {
 976     my $self = shift;
 977     my $link = shift;
 978     my $out  = shift;
 979     my $len  = shift;
 980     my $acc   = shift;
 981     my ($namestr,$linkstr,$seq,$name,$start,$end,$disname,$add,$place);
 982
 983     if( !defined $len ) {
 984         $len = 22;
 985     }
 986
 987     foreach $seq ( $self->eachSeq() ) {
 988         $name = $seq->id();
 989         $disname = $self->get_displayname($seq->get_nse());
 990
 991         $add = $len - length($disname);
 992         $place = " " x $add;
 993
 994
 995 #       print $out "Going to eval \$linkstr = $link\n";
 996         eval ("\$linkstr = \"$link\"");
 997         if( defined $acc ) {
 998             print $out sprintf("%s%s%s %s\n",$linkstr,$place,$seq->str(),$seq->names()->{'acc'});
 999         } else {
1000             print $out sprintf("%s%s%s\n",$linkstr,$place,$seq->str());
1001         }
1002     }
1003 }
1004
1005
1006 =head2 write_Pfam
1007
1008  Title     : write_Pfam
1009  Usage     : $ali->write_Pfam(\*OUTPUT)
1010            :
1011            :
1012  Function  : writes a Pfam/Mul formatted
1013            : file
1014            :
1015            :
1016  Returns   :
1017  Argument  :
1018
1019 =cut
1020
1021 sub write_Pfam {
1022     my $self = shift;
1023     my $out  = shift;
1024     my $acc  = shift;
1025     my ($namestr,$seq,$add);
1026     my ($maxn);
1027
1028     $maxn = $self->maxdisplayname_length();
1029
1030     foreach $seq ( $self->eachSeq() ) {
1031         $namestr = $self->get_displayname($seq->get_nse());
1032         $add = $maxn - length($namestr) + 2;
1033         $namestr .= " " x $add;
1034         if( defined $acc && $acc == 1) {
1035             print $out sprintf("%s  %s %s\n",$namestr,$seq->str(),$seq->names()->{'acc'});
1036         } else {
1037             print $out sprintf("%s  %s\n",$namestr,$seq->str());
1038         }
1039     }
1040 }
1041
1042
1043 =head2 write_clustalw
1044
1045  Title     : write_clustalw
1046  Usage     : $ali->write_clustalw
1047            :
1048            :
1049  Function  : writes a clustalw formatted
1050            : (.aln) file
1051            :
1052            :
1053  Returns   :
1054  Argument  :
1055
1056 =cut
1057
1058 sub write_clustalw {
1059     my $self = shift;
1060     my $file = shift;
1061     my ($count,$length,$seq,@seq,$tempcount);
1062
1063     print $file "CLUSTAL W(1.4) multiple sequence alignment\n\n\n";
1064
1065     $length = $self->length_aln();
1066     $count = $tempcount = 0;
1067     @seq = $self->eachSeq();
1068
1069     while( $count < $length ) {
1070         foreach $seq ( @seq ) {
1071             print $file sprintf("%-22s %s\n",$self->get_displayname($seq->get_nse()),substr($seq->str(),$count,50));
1072         }
1073         print $file "\n\n";
1074         $count += 50;
1075     }
1076
1077
1078 }
1079
1080
1081 =head2 write_fasta
1082
1083  Title     : write_fasta
1084  Usage     : $ali->write_fasta(\*OUTPUT)
1085            :
1086            :
1087  Function  : writes a fasta formatted alignment
1088            :
1089  Returns   :
1090  Argument  : reference-to-glob to file or filehandle object
1091
1092 =cut
1093
1094 sub write_fasta {
1095     my $self = shift;
1096     my $file  = shift;
1097     my ($seq,$rseq,$name,$count,$length,$seqsub);
1098
1099     foreach $rseq ( $self->eachSeq() ) {
1100         $name = $self->get_displayname($rseq->get_nse());
1101         $seq  = $rseq->str();
1102
1103         print $file ">$name\n";
1104
1105         $count =0;
1106         $length = length($seq);
1107         while( ($count * 60 ) < $length ) {
1108             $seqsub = substr($seq,$count*60,60);
1109             print $file "$seqsub\n";
1110             $count++;
1111         }
1112     }
1113 }
1114
1115
1116
1117 sub set_displayname {
1118     my $self = shift;
1119     my $name = shift;
1120     my $disname = shift;
1121
1122     # print "Setting $name to $disname\n";
1123     $self->{dis_name}->{$name} = $disname;
1124 }
1125
1126 sub get_displayname {
1127     my $self = shift;
1128     my $name = shift;
1129
1130     if( defined $self->{dis_name}->{$name} ) {
1131         return  $self->{dis_name}->{$name};
1132     } else {
1133         return $name;
1134     }
1135 }
1136
1137 =head2 set_displayname_flat
1138
1139  Title     : set_displayname_flat
1140  Usage     : $ali->set_displayname_flat()
1141            :
1142            :
1143  Function  : Makes all the sequences be displayed
1144            : as just their name, not name/start-end
1145            :
1146            :
1147  Returns   :
1148  Argument  :
1149
1150 =cut
1151
1152 sub set_displayname_flat {
1153     my $self = shift;
1154     my ($nse,$seq);
1155
1156     foreach $seq ( $self->eachSeq() ) {
1157         $nse = $seq->get_nse();
1158         $self->set_displayname($nse,$seq->id());
1159     }
1160 }
1161
1162 =head2 set_displayname_normal
1163
1164  Title     : set_displayname_normal
1165  Usage     : $ali->set_displayname_normal()
1166            :
1167            :
1168  Function  : Makes all the sequences be displayed
1169            : as name/start-end
1170            :
1171            :
1172  Returns   :
1173  Argument  :
1174
1175 =cut
1176
1177 sub set_displayname_normal {
1178     my $self = shift;
1179     my ($nse,$seq);
1180
1181     foreach $seq ( $self->eachSeq() ) {
1182         $nse = $seq->get_nse();
1183         $self->set_displayname($nse,sprintf("%s/%d-%d",$seq->id(),$seq->start(),$seq->end()));
1184     }
1185 }
1186
1187 =head2 set_displayname_count
1188
1189  Title     : set_displayname_count
1190  Usage     : $ali->set_displayname_count
1191            :
1192            :
1193  Function  : sets the names to be name_#
1194            : where # is the number of times this
1195            : name has been used.
1196            :
1197  Returns   :
1198  Argument  :
1199
1200 =cut
1201
1202
1203 sub set_displayname_count {
1204     my $self= shift;
1205     my (@arr,$name,$seq,$count,$temp,$nse);
1206
1207     foreach $seq ( $self->each_alphabetically() ) {
1208         $nse = $seq->get_nse();
1209
1210         #name will be set when this is the second
1211         #time (or greater) is has been seen
1212
1213         if( $name eq ($seq->id()) ) {
1214             $temp = sprintf("%s_%s",$name,$count);
1215             $self->set_displayname($nse,$temp);
1216             $count++;
1217         } else {
1218             $count = 1;
1219             $name = $seq->id();
1220             $temp = sprintf("%s_%s",$name,$count);
1221             $self->set_displayname($nse,$temp);
1222             $count++;
1223         }
1224     }
1225
1226 }
1227
1228 =head2 each_alphabetically
1229
1230  Title     : each_alphabetically
1231  Usage     : foreach $seq ( $ali->each_alphabetically() )
1232            :
1233            :
1234  Function  : returns an array of sequence object sorted
1235            : alphabetically by name and then by start point
1236            :
1237            : Does not change the order of the alignment
1238  Returns   :
1239  Argument  :
1240
1241 =cut
1242
1243
1244 sub each_alphabetically {
1245     my $self = shift;
1246     my ($seq,$nse,@arr,%hash,$count);
1247
1248     foreach $seq ( $self->eachSeq() ) {
1249         $nse = $seq->get_nse("-","-");
1250         $hash{$nse} = $seq;
1251     }
1252
1253
1254     foreach $nse ( sort alpha_startend keys %hash) {
1255         push(@arr,$hash{$nse});
1256     }
1257
1258     return @arr;
1259
1260 }
1261
1262
1263 =head2 sort_alphabetically
1264
1265  Title     : sort_alphabetically
1266  Usage     : $ali->sort_alphabetically
1267            :
1268            :
1269  Function  : changes the order of the alignemnt
1270            : to alphabetical on name followed by
1271            : numerical by number
1272            :
1273  Returns   :
1274  Argument  :
1275
1276 =cut
1277
1278 sub sort_alphabetically {
1279     my $self = shift;
1280     my ($seq,$nse,@arr,%hash,$count);
1281
1282
1283     foreach $seq ( $self->eachSeq() ) {
1284         $nse = $seq->get_nse("-","-");
1285         $hash{$nse} = $seq;
1286     }
1287
1288     $count = 0;
1289
1290     %{$self->{'order'}} = (); # reset the hash;
1291
1292     foreach $nse ( sort alpha_startend keys %hash) {
1293         $self->{'order'}->{$count} = $nse;
1294
1295         $count++;
1296     }
1297
1298 }
1299
1300 =head2 map_chars
1301
1302  Title     : map_chars
1303  Usage     : $ali->map_chars('\.','-')
1304            :
1305            :
1306  Function  : does a s/$arg1/$arg2/ on
1307            : the sequences. Useful for
1308            : gap characters
1309            :
1310            : Notice that the from (arg1) is interpretted
1311            : as a regex, so be careful about quoting meta
1312            : characters (eg $ali->map_chars('.','-') wont
1313            : do what you want)
1314  Returns   :
1315  Argument  :
1316
1317 =cut
1318
1319
1320 sub map_chars {
1321     my $self = shift;
1322     my $from = shift;
1323     my $to   = shift;
1324     my ($seq,$temp);
1325
1326     foreach $seq ( $self->eachSeq() ) {
1327         $temp = $seq->str();
1328         $temp =~ s/$from/$to/g;
1329         $seq->setseq($temp);
1330     }
1331 }
1332
1333 =head2 uppercase
1334
1335  Title     : uppercase()
1336  Usage     : $ali->uppercase()
1337            :
1338            :
1339  Function  : Sets all the sequences
1340            : to uppercase
1341            :
1342            :
1343  Returns   :
1344  Argument  :
1345
1346 =cut
1347
1348 sub uppercase {
1349     my $self = shift;
1350     my $seq;
1351     my $temp;
1352
1353     foreach $seq ( $self->eachSeq() ) {
1354       $temp = $seq->str();
1355       $temp =~ tr/[a-z]/[A-Z]/;
1356
1357       $seq->setseq($temp);
1358     }
1359 }
1360
1361
1362 sub alpha_startend {
1363     my ($aname,$astart,$bname,$bstart);
1364     ($aname,$astart) = split (/-/,$a);
1365     ($bname,$bstart) = split (/-/,$b);
1366
1367     if( $aname eq $bname ) {
1368         return $astart <=> $bstart;
1369     }
1370     else {
1371         return $aname cmp $bname;
1372     }
1373
1374 }
1375
1376 =head2 no_sequences
1377
1378  Title     : no_sequences
1379  Usage     : $depth = $ali->no_sequences
1380            :
1381            :
1382  Function  : number of sequence in the
1383            : sequence alignment
1384            :
1385            :
1386  Returns   :
1387  Argument  :
1388
1389 =cut
1390
1391 sub no_sequences {
1392     my $self = shift;
1393
1394     return keys %{$self->{'seq'}};
1395 }
1396
1397 =head2 no_residues
1398
1399  Title     : no_residues
1400  Usage     : $no = $ali->no_residues
1401            :
1402            :
1403  Function  : number of residues in total
1404            : in the alignment
1405            :
1406            :
1407  Returns   :
1408  Argument  :
1409
1410 =cut
1411
1412
1413 sub no_residues {
1414     my $self = shift;
1415     my $count = 0;
1416     my ($key,$temp);
1417
1418     foreach $key ( keys %{$self->{'seq'}} ) {
1419
1420         $temp = $self->{'seq'}->{$key}->str();
1421
1422
1423         $temp =~ s/[^A-Za-z]//g;
1424
1425         $count += length($temp);
1426     }
1427
1428     return $count;
1429 }
1430
1431
1432
1433 =head2 purge
1434
1435  Title   : purge
1436  Usage   : $aln->purge(0.7);
1437  Function: removes sequences above whatever %id
1438  Example :
1439  Returns : An array of the removed sequences
1440  Arguments
1441
1442  This function will grind on large alignments. Beware!
1443
1444  (perhaps not ideally implemented)
1445
1446 =cut
1447
1448 sub purge{
1449   my ($self,$perc) = @_;
1450   my (@seqs,$seq,%removed,$i,$j,$count,@one,@two,$seq2,$k,$res,$ratio,@ret);
1451
1452  # $self->write_Pfam(\*STDOUT);
1453
1454   @seqs = $self->eachSeq();
1455
1456   #$self->write_Pfam(\*STDOUT);
1457
1458 #  foreach $seq ( @seqs ) {
1459 #      printf("$seq %s %s\n",$seq->get_nse(),join(' ',$seq->dump()));
1460 #  }
1461
1462   for($i=0;$i< @seqs;$i++ ) {
1463       $seq = $seqs[$i];
1464       #printf "%s\n", $seq->out_fasta();
1465       #print "\n\nDone\n\n";
1466
1467       # if it has already been removed, skip
1468
1469       if( $removed{$seq->get_nse()} == 1 ) {
1470           next;
1471       }
1472
1473       # if not ... look at the other sequences
1474
1475       # make the first array once
1476
1477       @one = $seq->seq();
1478       for($j=$i+1;$j < @seqs;$j++) {
1479           $seq2 = $seqs[$j];
1480           if ( $removed{$seq2->get_nse()} == 1 ) {
1481               next;
1482           }
1483           @two = $seq2->seq();
1484           $count = 0;
1485           $res = 0;
1486           for($k=0;$k<@one;$k++) {
1487               if( $one[$k] ne '.' && $one[$k] ne '-' && $one[$k] eq $two[$k]) {
1488                   $count++;
1489               }
1490               if( $one[$k] ne '.' && $one[$k] ne '-' && $two[$k] ne '.' && $two[$k] ne '-' ) {
1491                   $res++;
1492               }
1493           }
1494           if( $res == 0 ) {
1495               $ratio = 0;
1496           } else {
1497               $ratio = $count/$res;
1498           }
1499
1500           if( $ratio > $perc ) {
1501               $removed{$seq2->get_nse()} = 1;
1502               $self->removeSeq($seq2);
1503               push(@ret,$seq2);
1504           } else {
1505               # could put a comment here!
1506           }
1507       }
1508   }
1509
1510   return @ret;
1511 }
1512
1513
1514
1515 =head2 percentage_identity
1516
1517  Title   : percentage_identity
1518  Usage   : $id = $align->percentage_identity
1519  Function:
1520     The function uses a fast method to calculate the average percentage identity of the alignment
1521  Returns : The average percentage identity of the alignment
1522  Args    : None
1523
1524 =cut
1525
1526 sub percentage_identity{
1527    my ($self,@args) = @_;
1528
1529    my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
1530                    'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
1531
1532    my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
1533
1534    if (! $self->is_flush()) {
1535        $self->throw("All sequences in the alignment must be the same length");
1536    }
1537
1538    @seqs = $self->eachSeq();
1539    $len = $self->length_aln();
1540
1541    # load the each hash with correct keys for existence checks
1542    for( my $index=0; $index < $len; $index++) {
1543        foreach my $letter (@alphabet) {
1544            $countHashes[$index]->{$letter} = 0;
1545        }
1546    }
1547
1548
1549    foreach my $seq (@seqs)  {
1550        my @seqChars = $seq->ary();
1551        for( my $column=0; $column < @seqChars; $column++ ) {
1552            my $char = uc($seqChars[$column]);
1553            if (exists $countHashes[$column]->{$char}) {
1554                $countHashes[$column]->{$char}++;
1555            }
1556        }
1557    }
1558
1559    $total = 0;
1560    $divisor = 0;
1561    for(my $column =0; $column < $len; $column++) {
1562        my %hash = %{$countHashes[$column]};
1563        $subdivisor = 0;
1564        foreach my $res (keys %hash) {
1565            $total += $hash{$res}*($hash{$res} - 1);
1566            $subdivisor += $hash{$res};
1567        }
1568        $divisor += $subdivisor * ($subdivisor - 1);
1569    }
1570    return ($total / $divisor )*100.0;
1571 }
1572
1573
1574 =head2 read_Prodom
1575
1576  Title   : read_Prodom
1577  Usage   : $ali->read_Prodom( $file )
1578  Function: Reads in a Prodom format alignment
1579  Returns :
1580     Args    : A filehandle glob or ref. to a filehandle object
1581
1582 =cut
1583
1584 sub read_Prodom{
1585    my $self = shift;
1586    my $file = shift;
1587
1588    my ($acc, $fake_id, $start, $end, $seq, $add, %names);
1589
1590    while (<$file>){
1591        if (/^AL\s+(\S+)\|(\S+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)$/){
1592            $acc=$1;
1593            $fake_id=$2;  # Accessions have _species appended
1594            $start=$3;
1595            $end=$4;
1596            $seq=$5;
1597
1598            $names{'fake_id'} = $fake_id;
1599
1600            $add = new Bio::Seq('-seq'=>$seq,
1601                                '-id'=>$acc,
1602                                '-names'=>\%names,
1603                                '-start'=>$start,
1604                                '-end'=>$end,
1605                                '-type'=>'aligned');
1606
1607            $self->addSeq($add);
1608         }
1609     }
1610 }
1611
1612
1613 1;
1614