*** empty log message ***
[bioperl-live.git] / Bio / SimpleAlign.pm
blob1ac961acaffb62e9e2cc523fc7e68994dd2deaba
3 # BioPerl module for SimpleAlign
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 SimpleAlign - Multiple alignments held as a set of sequences
17 =head1 SYNOPSIS
19 $aln = new Bio::SimpleAlign;
21 $aln->read_MSF(\*STDIN);
23 $aln->write_fasta(\*STDOUT);
25 =head1 INSTALLATION
27 This module is included with the central Bioperl distribution:
29 http://bio.perl.org/Core/Latest
30 ftp://bio.perl.org/pub/DIST
32 Follow the installation instructions included in the README file.
34 =head1 DESCRIPTION
36 SimpleAlign handles multiple alignments of sequences. It is very permissive
37 of types (it wont insist on things being all same length etc): really
38 it is a SequenceSet explicitly held in memory with a whole series of
39 built in manipulations and especially file format systems for
40 read/writing alignments.
42 SimpleAlign basically views an alignment as an immutable block of text.
43 SimpleAlign *is not* the object to be using if you want to manipulate an
44 alignment (eg, truncate an alignment or remove columns that are all gaps).
45 These functions are much better done by UnivAln by Georg Fuellen.
47 However for lightweight display/formatting - this is the one to use.
49 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
50 the alignment, and this is the key for the internal hashes.
51 (name,start,end is abreviated nse in the code). However, in many cases
52 people don't want the name/start-end to be displayed: either multiple
53 names in an alignment or names specific to the alignment
54 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
55 'displayname', and generally is what is used to print out the
56 alignment. They default to name/start-end
58 The SimpleAlign Module came from Ewan Birney's Align module
60 =head1 PROGRESS
62 SimpleAlign is being slowly converted to bioperl coding standards,
63 mainly by Ewan.
65 =over
67 =item Use Bio::Root::Object - done
69 =item Use proper exceptions - done
71 =item Use hashed constructor - not done!
73 =end
75 =head1 FEEDBACK
77 =head2 Mailing Lists
79 User feedback is an integral part of the evolution of this and other Bioperl modules.
80 Send your comments and suggestions preferably to one of the Bioperl mailing lists.
81 Your participation is much appreciated.
83 vsns-bcd-perl@lists.uni-bielefeld.de - General discussion
84 vsns-bcd-perl-guts@lists.uni-bielefeld.de - Technically-oriented discussion
85 http://bio.perl.org/MailList.html - About the mailing lists
87 =head2 Reporting Bugs
89 Report bugs to the Bioperl bug tracking system to help us keep track the bugs and
90 their resolution. Bug reports can be submitted via email or the web:
92 bioperl-bugs@bio.perl.org
93 http://bio.perl.org/bioperl-bugs/
95 =head1 AUTHOR
97 Ewan Birney, birney@sanger.ac.uk
99 =head1 SEE ALSO
101 Bio::Seq.pm - The biosequence object
103 http://bio.perl.org/Projects/modules.html - Online module documentation
104 http://bio.perl.org/Projects/SeqAlign/ - Bioperl sequence alignment project
105 http://bio.perl.org/ - Bioperl Project Homepage
107 =head1 APPENDIX
109 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
111 =cut
114 # Let the code begin...
117 package Bio::SimpleAlign;
118 use vars qw($AUTOLOAD @ISA);
119 use strict;
121 # Object preamble - inheriets from Bio::Root::Object
123 use Bio::Root::Object;
124 use Bio::Seq; # uses Seq's as list
127 @ISA = qw(Bio::Root::Object Exporter);
128 # new() is inherited from Bio::Root::Object
130 # _initialize is where the heavy stuff will happen when new is called
132 sub _initialize {
133 my($self,@args) = @_;
135 my $make = $self->SUPER::_initialize;
137 # we need to set up internal hashs first!
139 $self->{'seq'} = {};
140 $self->{'order'} = {};
141 $self->{'dis_name'} = {};
142 $self->{'id'} = 'NoName';
144 # maybe we should automatically read in from args. Hmmm...
146 # set stuff in self from @args
147 return $make; # success - we hope!
153 =head2 id
155 Title : id
156 Usage : $myalign->id("Ig")
157 Function : Gets/sets the id field of the alignment
159 Returns : An id string
160 Argument : An id string (optional)
162 =cut
164 sub id {
165 my ($self, $name) = @_;
167 if (defined( $name )) {
168 $self->{'id'} = $name;
171 return $self->{'id'};
177 =head2 addSeq
179 Title : addSeq
180 Usage : $myalign->addSeq($newseq);
183 Function : Adds another sequence to the alignment
184 : *doesn't* align it - just adds it to the
185 : hashes
187 Returns : nothing
188 Argument :
190 =cut
193 sub addSeq {
194 my $self = shift;
195 my $seq = shift;
196 my $order = shift;
197 my ($name,$id,$start,$end);
199 $id = $seq->id();
200 $start = $seq->start();
201 $end = $seq->end();
204 if( !defined $order ) {
205 $order = keys %{$self->{'seq'}};
208 $name = sprintf("%s-%d-%d",$id,$start,$end);
210 if( $self->{'seq'}->{$name} ) {
211 $self->warn("Replacing one sequence [$name]\n");
214 $self->{'seq'}->{$name} = $seq;
215 $self->{'order'}->{$order} = $name;
220 =head2 removeSeq
222 Title : removeSeq
223 Usage : $aln->removeSeq($seq);
224 Function : removes a single sequence from an alignment
226 =cut
230 sub removeSeq {
231 my $self = shift;
232 my $seq = shift;
233 my ($name,$id,$start,$end);
235 $seq->out_fasta(\*STDOUT);
236 $id = $seq->id();
237 $start = $seq->start();
238 $end = $seq->end();
239 $name = sprintf("%s-%d-%d",$id,$start,$end);
241 if( !exists $self->{'seq'}->{$name} ) {
242 $self->throw("Sequence $name does not exist in the alignment to remove!");
245 delete $self->{'seq'}->{$name};
247 # we can't do anything about the order hash but that is ok
248 # because eachSeq will handle it
252 =head2 eachSeq
254 Title : eachSeq
255 Usage : foreach $seq ( $align->eachSeq() )
258 Function : gets an array of Seq objects from the
259 : alignment
262 Returns : an array
263 Argument : nothing
265 =cut
267 sub eachSeq {
268 my $self = shift;
269 my (@arr,$order);
271 foreach $order ( sort { $a <=> $b } keys %{$self->{'order'}} ) {
272 if( exists $self->{'seq'}->{$self->{'order'}->{$order}} ) {
273 #print STDOUT sprintf("Putting %s\n", $self->{'order'}->{$order});
274 push(@arr,$self->{'seq'}->{$self->{'order'}->{$order}});
278 return @arr;
283 =head2 consensus_string
285 Title : consensus_string
286 Usage : $str = $ali->consensus_string()
289 Function : Makes a consensus
293 Returns :
294 Argument :
296 =cut
298 sub consensus_string {
299 my $self = shift;
300 my $len;
301 my ($out,$count);
303 $out = "";
305 $len = $self->length_aln();
307 foreach $count ( 0 .. $len ) {
308 $out .= $self->consensus_aa($count);
311 return $out;
315 sub consensus_aa {
316 my $self = shift;
317 my $point = shift;
318 my ($seq,%hash,$count,$letter,$key);
321 foreach $seq ( $self->eachSeq() ) {
322 $letter = substr($seq->{'seq'},$point,1);
323 ($letter =~ /\./) && next;
324 # print "Looking at $letter\n";
325 $hash{$letter}++;
328 $count = -1;
329 $letter = '?';
331 foreach $key ( keys %hash ) {
332 # print "Now at $key $hash{$key}\n";
333 if( $hash{$key} > $count ) {
334 $letter = $key;
335 $count = $hash{$key};
338 return $letter;
344 =head2 read_MSF
346 Title : read_MSF
347 Usage : $al->read_MSF(\*STDIN);
348 Function: reads MSF formatted files. Tries to read *all* MSF
349 It reads all non whitespace characters in the alignment
350 area. For MSFs with weird gaps (eg ~~~) map them by using
351 $al->map_chars('~','-');
352 Example :
353 Returns :
354 Args : filehandle
357 =cut
359 sub read_MSF{
360 my ($self,$fh) = @_;
361 my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq);
363 # read in the name section
365 while( <$fh> ) {
366 /\/\// && last; # move to alignment section
367 /Name:\s+(\S+)/ && do { $name = $1;
368 $hash{$name} = ""; # blank line
369 push(@names,$name); # we need it ordered!
371 # otherwise - skip
374 # alignment section
376 while( <$fh> ) {
377 /^\s+(\S+)\s+(.*)$/ && do {
378 $name = $1;
379 $str = $2;
380 if( ! exists $hash{$name} ) {
381 $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
383 $str =~ s/\s//g;
384 $hash{$name} .= $str;
388 # now got this as a name - sequence hash. Lets make some sequences!
390 $count = 0;
392 foreach $name ( @names ) {
393 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
394 $seqname = $1;
395 $start = $2;
396 $end = $3;
397 } else {
398 $seqname=$name;
399 $start = 1;
400 $str = $hash{$name};
401 $str =~ s/[^A-Za-z]//g;
402 $end = length($str);
405 $seq = new Bio::Seq('-seq'=>$hash{$name},
406 '-id'=>$seqname,
407 '-start'=>$start,
408 '-end'=>$end,
409 '-type'=>'aligned');
411 $self->addSeq($seq);
413 $count++;
416 return $count;
419 =head2 write_MSF
421 Title : write_MSF
422 Usage : $ali->write_MSF(\*FH)
425 Function : writes MSF format output
429 Returns :
430 Argument :
432 =cut
434 sub write_MSF {
435 my $self = shift;
436 my $file = shift;
437 my $msftag;
438 my $type;
439 my $count = 0;
440 my $maxname;
441 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index);
443 $date = localtime(time);
444 $msftag = "MSF";
445 $type = "P";
446 $maxname = $self->maxnse_length();
447 $length = $self->length_aln();
448 $name = $self->id();
449 if( !defined $name ) {
450 $name = "Align";
453 print $file sprintf("\n%s MSF: %d Type: P %s Check: 00 ..\n\n",$name,$self->no_sequences,$date);
455 foreach $seq ( $self->eachSeq() ) {
456 $name = $self->get_displayname($seq->get_nse());
457 $miss = $maxname - length ($name);
458 $miss += 2;
459 $pad = " " x $miss;
461 print $file sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$name,$pad,length $seq->str(),$seq->GCG_checksum());
463 $hash{$name} = $seq->str();
464 push(@arr,$name);
468 # ok - heavy handed, but there you go.
470 print "\n//\n";
472 while( $count < $length ) {
474 # there is another block to go!
476 foreach $name ( @arr ) {
477 print $file sprintf("%20s ",$name);
479 $tempcount = $count;
480 $index = 0;
481 while( ($tempcount + 10 < $length) && ($index < 5) ) {
483 print $file sprintf("%s ",substr($hash{$name},$tempcount,10));
485 $tempcount += 10;
486 $index++;
490 # ok, could be the very last guy ;)
493 if( $index < 5) {
496 # space to print!
499 print $file sprintf("%s ",substr($hash{$name},$tempcount));
500 $tempcount += 10;
503 print $file "\n";
504 } # end of each sequence
506 print "\n\n";
508 $count = $tempcount;
513 sub maxname_length {
514 my $self = shift;
515 my $maxname = (-1);
516 my ($seq,$len);
518 foreach $seq ( $self->eachSeq() ) {
519 $len = length $seq->id();
521 if( $len > $maxname ) {
522 $maxname = $len;
526 return $maxname;
529 sub maxnse_length {
530 my $self = shift;
531 my $maxname = (-1);
532 my ($seq,$len);
534 foreach $seq ( $self->eachSeq() ) {
535 $len = length $seq->get_nse();
537 if( $len > $maxname ) {
538 $maxname = $len;
542 return $maxname;
545 sub maxdisplayname_length {
546 my $self = shift;
547 my $maxname = (-1);
548 my ($seq,$len);
550 foreach $seq ( $self->eachSeq() ) {
551 $len = length $self->get_displayname($seq->get_nse());
553 if( $len > $maxname ) {
554 $maxname = $len;
558 return $maxname;
562 =head2 length_aln
564 Title : length_aln()
565 Usage : $len = $ali->length_aln()
568 Function : returns the maximum length of the alignment.
569 : To be sure the alignment is a block, use is_flush
572 Returns :
573 Argument :
575 =cut
577 sub length_aln {
578 my $self = shift;
579 my $seq;
580 my $length = (-1);
581 my ($temp,$len);
583 foreach $seq ( $self->eachSeq() ) {
584 $temp = length($seq->str());
585 if( $temp > $length ) {
586 $length = $temp;
590 return $length;
594 =head2 is_flush
596 Title : is_flush
597 Usage : if( $ali->is_flush() )
600 Function : Tells you whether the alignment
601 : is flush, ie all of the same length
604 Returns : 1 or 0
605 Argument :
607 =cut
609 sub is_flush {
610 my $self = shift;
611 my $seq;
612 my $length = (-1);
613 my $temp;
615 foreach $seq ( $self->eachSeq() ) {
616 if( $length == (-1) ) {
617 $length = length($seq->str());
618 next;
621 $temp = length($seq->str());
623 if( $temp != $length ) {
624 return 0;
628 return 1;
632 =head2 read_fasta
634 Title : read_fasta
635 Usage : $ali->read_fasta(\*INPUT)
638 Function : reads in a fasta formatted
639 : file for an alignment
642 Returns :
643 Argument :
645 =cut
647 sub read_fasta {
648 my $self = shift;
649 my $in = shift;
650 my $count = 0;
651 my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align);
653 while( <$in> ) {
654 if( /^>(\S+)/ ) {
655 $tempname = $1;
656 if( defined $name ) {
657 # put away last name and sequence
659 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
660 $seqname = $1;
661 $start = $2;
662 $end = $3;
663 } else {
664 $seqname=$name;
665 $start = 1;
666 $end = length($align{$name});
669 $seq = new Bio::Seq('-seq'=>$seqchar,
670 '-id'=>$seqname,
671 '-start'=>$start,
672 '-end'=>$end,
673 '-type'=>'aligned');
675 $self->addSeq($seq);
677 $count++;
679 $name = $tempname;
680 $seqchar = "";
681 next;
683 s/[^A-Za-z\.\-]//g;
684 $seqchar .= $_;
687 # put away last name and sequence
689 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
690 $seqname = $1;
691 $start = $2;
692 $end = $3;
693 } else {
694 $seqname=$name;
695 $start = 1;
696 $end = length($align{$name});
699 $seq = new Bio::Seq('-seq'=>$seqchar,
700 '-id'=>$seqname,
701 '-start'=>$start,
702 '-end'=>$end,
703 '-type'=>'aligned');
705 $self->addSeq($seq);
707 $count++;
709 return $count;
713 =head2 read_selex
715 Title : read_selex
716 Usage : $ali->read_selex(\*INPUT)
719 Function : reads selex (hmmer) format
720 : alignments
723 Returns :
724 Argument :
726 =cut
728 sub read_selex {
729 my $self = shift;
730 my $in = shift;
731 my ($start,$end,%align,$name,$seqname,$seq,$count,%hash,%c2name,$no);
733 # not quite sure 'what' selex format is, but here we go!
735 while( <$in> ) {
736 #/^#=SQ/ && last;
737 /^#=RF/ && last;
740 while( <$in> ) {
741 /^#/ && next;
742 /^\s/ && next;
743 /\w/ && last;
746 if( eof($in) ) {
747 warn("Exhausted file without reading selex format");
748 return undef;
750 # first line starting with word
751 chop;
753 $count =0;
755 if( !/^(\S+)\s+([A-Za-z\.\-]+)\s*/ ) {
756 warn("Ok. I think [$_] does not look like selex format. Not sure...");
757 return (-1);
760 $name = $1;
761 $seq = $2;
763 $align{$name} .= $seq;
764 $c2name{$count} = $name;
765 $count++;
767 while( <$in> ) {
768 !/^([^\#]\S+)\s+([A-Za-z\.\-]+)\s*/ && next;
770 $name = $1;
771 $seq = $2;
773 if( ! defined $align{$name} ) {
774 $c2name{$count} = $name;
775 $count++;
778 $align{$name} .= $seq;
781 # ok... now we can make the sequences
783 $count = 0;
784 foreach $no ( sort { $a <=> $b } keys %c2name ) {
785 $name = $c2name{$no};
787 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
788 $seqname = $1;
789 $start = $2;
790 $end = $3;
791 } else {
792 $seqname=$name;
793 $start = 1;
794 $end = length($align{$name});
797 $seq = new Bio::Seq('-seq'=>$align{$name},
798 '-id'=>$seqname,
799 '-start'=>$start,
800 '-end'=>$end,
801 '-type'=>'aligned'
804 $self->addSeq($seq);
806 $count++;
809 return $count;
813 =head2 read_mase
815 Title : read_mase
816 Usage : $ali->read_mase(\*INPUT)
819 Function : reads mase (seaview)
820 : formatted alignments
823 Returns :
824 Argument :
826 =cut
828 sub read_mase {
829 my $self = shift;
830 my $in = shift;
831 my $name;
832 my $start;
833 my $end;
834 my $seq;
835 my $add;
836 my $count = 0;
839 while( <$in> ) {
840 /^;/ && next;
841 if( /^(\S+)\/(\d+)-(\d+)/ ) {
842 $name = $1;
843 $start = $2;
844 $end = $3;
845 } else {
846 s/\s//g;
847 $name = $_;
848 $end = -1;
851 $seq = "";
853 while( <$in> ) {
854 /^;/ && last;
855 s/[^A-Za-z\.\-]//g;
856 $seq .= $_;
858 if( $end == -1) {
859 $start = 1;
860 $end = length($seq);
863 $add = new Bio::Seq('-seq'=>$seq,
864 '-id'=>$name,
865 '-start'=>$start,
866 '-end'=>$end,
867 '-type'=>'aligned');
871 $self->addSeq($add);
873 $count++;
876 return $count;
881 =head2 read_Pfam_file
883 Title : read_Pfam_file
884 Usage : $ali->read_Pfam_file("thisfile");
886 Function : opens a filename, reads
887 : a Pfam (mul) formatted alignment
891 Returns :
892 Argument :
894 =cut
896 sub read_Pfam_file {
897 my $self = shift;
898 my $file = shift;
899 my $out;
901 if( open(_TEMPFILE,$file) == 0 ) {
902 $self->throw("Could not open $file for reading Pfam style alignment");
903 return 0;
906 $out = read_Pfam($self,\*_TEMPFILE);
908 close(_TEMPFILE);
910 return $out;
914 =head2 read_Pfam
916 Title : read_Pfam
917 Usage : $ali->read_Pfam(\*INPUT)
920 Function : reads a Pfam formatted
921 : Alignment (Mul format).
922 : - this is the format used by Belvu
924 Returns :
925 Argument :
927 =cut
929 sub read_Pfam {
930 my $self = shift;
931 my $in = shift;
932 my $name;
933 my $start;
934 my $end;
935 my $seq;
936 my $add;
937 my $acc;
938 my %names;
939 my $count = 0;
941 while( <$in> ) {
942 chop;
943 /^\/\// && last;
944 if( !/^(\S+)\/(\d+)-(\d+)\s+(\S+)\s*/ ) {
945 $self->throw("Found a bad line [$_] in the pfam format alignment");
946 next;
949 $name = $1;
950 $start = $2;
951 $end = $3;
952 $seq = $4;
955 # there may be an accession number at the end of the line
957 ($acc) = ($' =~ /\s*(\S+)\s*/);
958 $names{'acc'} = $acc;
960 $add = new Bio::Seq('-seq'=>$seq,
961 '-id'=>$name,
962 '-names'=>\%names,
963 '-start'=>$start,
964 '-end'=>$end,
965 '-type'=>'aligned');
967 $self->addSeq($add);
969 $count++;
972 return $count;
975 sub write_Pfam_link {
976 my $self = shift;
977 my $link = shift;
978 my $out = shift;
979 my $len = shift;
980 my $acc = shift;
981 my ($namestr,$linkstr,$seq,$name,$start,$end,$disname,$add,$place);
983 if( !defined $len ) {
984 $len = 22;
987 foreach $seq ( $self->eachSeq() ) {
988 $name = $seq->id();
989 $disname = $self->get_displayname($seq->get_nse());
991 $add = $len - length($disname);
992 $place = " " x $add;
995 # print $out "Going to eval \$linkstr = $link\n";
996 eval ("\$linkstr = \"$link\"");
997 if( defined $acc ) {
998 print $out sprintf("%s%s%s %s\n",$linkstr,$place,$seq->str(),$seq->names()->{'acc'});
999 } else {
1000 print $out sprintf("%s%s%s\n",$linkstr,$place,$seq->str());
1006 =head2 write_Pfam
1008 Title : write_Pfam
1009 Usage : $ali->write_Pfam(\*OUTPUT)
1012 Function : writes a Pfam/Mul formatted
1013 : file
1016 Returns :
1017 Argument :
1019 =cut
1021 sub write_Pfam {
1022 my $self = shift;
1023 my $out = shift;
1024 my $acc = shift;
1025 my ($namestr,$seq,$add);
1026 my ($maxn);
1028 $maxn = $self->maxdisplayname_length();
1030 foreach $seq ( $self->eachSeq() ) {
1031 $namestr = $self->get_displayname($seq->get_nse());
1032 $add = $maxn - length($namestr) + 2;
1033 $namestr .= " " x $add;
1034 if( defined $acc && $acc == 1) {
1035 print $out sprintf("%s %s %s\n",$namestr,$seq->str(),$seq->names()->{'acc'});
1036 } else {
1037 print $out sprintf("%s %s\n",$namestr,$seq->str());
1043 =head2 write_clustalw
1045 Title : write_clustalw
1046 Usage : $ali->write_clustalw
1049 Function : writes a clustalw formatted
1050 : (.aln) file
1053 Returns :
1054 Argument :
1056 =cut
1058 sub write_clustalw {
1059 my $self = shift;
1060 my $file = shift;
1061 my ($count,$length,$seq,@seq,$tempcount);
1063 print $file "CLUSTAL W(1.4) multiple sequence alignment\n\n\n";
1065 $length = $self->length_aln();
1066 $count = $tempcount = 0;
1067 @seq = $self->eachSeq();
1069 while( $count < $length ) {
1070 foreach $seq ( @seq ) {
1071 print $file sprintf("%-22s %s\n",$self->get_displayname($seq->get_nse()),substr($seq->str(),$count,50));
1073 print $file "\n\n";
1074 $count += 50;
1081 =head2 write_fasta
1083 Title : write_fasta
1084 Usage : $ali->write_fasta(\*OUTPUT)
1087 Function : writes a fasta formatted alignment
1089 Returns :
1090 Argument : reference-to-glob to file or filehandle object
1092 =cut
1094 sub write_fasta {
1095 my $self = shift;
1096 my $file = shift;
1097 my ($seq,$rseq,$name,$count,$length,$seqsub);
1099 foreach $rseq ( $self->eachSeq() ) {
1100 $name = $self->get_displayname($rseq->get_nse());
1101 $seq = $rseq->str();
1103 print $file ">$name\n";
1105 $count =0;
1106 $length = length($seq);
1107 while( ($count * 60 ) < $length ) {
1108 $seqsub = substr($seq,$count*60,60);
1109 print $file "$seqsub\n";
1110 $count++;
1117 sub set_displayname {
1118 my $self = shift;
1119 my $name = shift;
1120 my $disname = shift;
1122 # print "Setting $name to $disname\n";
1123 $self->{dis_name}->{$name} = $disname;
1126 sub get_displayname {
1127 my $self = shift;
1128 my $name = shift;
1130 if( defined $self->{dis_name}->{$name} ) {
1131 return $self->{dis_name}->{$name};
1132 } else {
1133 return $name;
1137 =head2 set_displayname_flat
1139 Title : set_displayname_flat
1140 Usage : $ali->set_displayname_flat()
1143 Function : Makes all the sequences be displayed
1144 : as just their name, not name/start-end
1147 Returns :
1148 Argument :
1150 =cut
1152 sub set_displayname_flat {
1153 my $self = shift;
1154 my ($nse,$seq);
1156 foreach $seq ( $self->eachSeq() ) {
1157 $nse = $seq->get_nse();
1158 $self->set_displayname($nse,$seq->id());
1162 =head2 set_displayname_normal
1164 Title : set_displayname_normal
1165 Usage : $ali->set_displayname_normal()
1168 Function : Makes all the sequences be displayed
1169 : as name/start-end
1172 Returns :
1173 Argument :
1175 =cut
1177 sub set_displayname_normal {
1178 my $self = shift;
1179 my ($nse,$seq);
1181 foreach $seq ( $self->eachSeq() ) {
1182 $nse = $seq->get_nse();
1183 $self->set_displayname($nse,sprintf("%s/%d-%d",$seq->id(),$seq->start(),$seq->end()));
1187 =head2 set_displayname_count
1189 Title : set_displayname_count
1190 Usage : $ali->set_displayname_count
1193 Function : sets the names to be name_#
1194 : where # is the number of times this
1195 : name has been used.
1197 Returns :
1198 Argument :
1200 =cut
1203 sub set_displayname_count {
1204 my $self= shift;
1205 my (@arr,$name,$seq,$count,$temp,$nse);
1207 foreach $seq ( $self->each_alphabetically() ) {
1208 $nse = $seq->get_nse();
1210 #name will be set when this is the second
1211 #time (or greater) is has been seen
1213 if( $name eq ($seq->id()) ) {
1214 $temp = sprintf("%s_%s",$name,$count);
1215 $self->set_displayname($nse,$temp);
1216 $count++;
1217 } else {
1218 $count = 1;
1219 $name = $seq->id();
1220 $temp = sprintf("%s_%s",$name,$count);
1221 $self->set_displayname($nse,$temp);
1222 $count++;
1228 =head2 each_alphabetically
1230 Title : each_alphabetically
1231 Usage : foreach $seq ( $ali->each_alphabetically() )
1234 Function : returns an array of sequence object sorted
1235 : alphabetically by name and then by start point
1237 : Does not change the order of the alignment
1238 Returns :
1239 Argument :
1241 =cut
1244 sub each_alphabetically {
1245 my $self = shift;
1246 my ($seq,$nse,@arr,%hash,$count);
1248 foreach $seq ( $self->eachSeq() ) {
1249 $nse = $seq->get_nse("-","-");
1250 $hash{$nse} = $seq;
1254 foreach $nse ( sort alpha_startend keys %hash) {
1255 push(@arr,$hash{$nse});
1258 return @arr;
1263 =head2 sort_alphabetically
1265 Title : sort_alphabetically
1266 Usage : $ali->sort_alphabetically
1269 Function : changes the order of the alignemnt
1270 : to alphabetical on name followed by
1271 : numerical by number
1273 Returns :
1274 Argument :
1276 =cut
1278 sub sort_alphabetically {
1279 my $self = shift;
1280 my ($seq,$nse,@arr,%hash,$count);
1283 foreach $seq ( $self->eachSeq() ) {
1284 $nse = $seq->get_nse("-","-");
1285 $hash{$nse} = $seq;
1288 $count = 0;
1290 %{$self->{'order'}} = (); # reset the hash;
1292 foreach $nse ( sort alpha_startend keys %hash) {
1293 $self->{'order'}->{$count} = $nse;
1295 $count++;
1300 =head2 map_chars
1302 Title : map_chars
1303 Usage : $ali->map_chars('\.','-')
1306 Function : does a s/$arg1/$arg2/ on
1307 : the sequences. Useful for
1308 : gap characters
1310 : Notice that the from (arg1) is interpretted
1311 : as a regex, so be careful about quoting meta
1312 : characters (eg $ali->map_chars('.','-') wont
1313 : do what you want)
1314 Returns :
1315 Argument :
1317 =cut
1320 sub map_chars {
1321 my $self = shift;
1322 my $from = shift;
1323 my $to = shift;
1324 my ($seq,$temp);
1326 foreach $seq ( $self->eachSeq() ) {
1327 $temp = $seq->str();
1328 $temp =~ s/$from/$to/g;
1329 $seq->setseq($temp);
1333 =head2 uppercase
1335 Title : uppercase()
1336 Usage : $ali->uppercase()
1339 Function : Sets all the sequences
1340 : to uppercase
1343 Returns :
1344 Argument :
1346 =cut
1348 sub uppercase {
1349 my $self = shift;
1350 my $seq;
1351 my $temp;
1353 foreach $seq ( $self->eachSeq() ) {
1354 $temp = $seq->str();
1355 $temp =~ tr/[a-z]/[A-Z]/;
1357 $seq->setseq($temp);
1362 sub alpha_startend {
1363 my ($aname,$astart,$bname,$bstart);
1364 ($aname,$astart) = split (/-/,$a);
1365 ($bname,$bstart) = split (/-/,$b);
1367 if( $aname eq $bname ) {
1368 return $astart <=> $bstart;
1370 else {
1371 return $aname cmp $bname;
1376 =head2 no_sequences
1378 Title : no_sequences
1379 Usage : $depth = $ali->no_sequences
1382 Function : number of sequence in the
1383 : sequence alignment
1386 Returns :
1387 Argument :
1389 =cut
1391 sub no_sequences {
1392 my $self = shift;
1394 return keys %{$self->{'seq'}};
1397 =head2 no_residues
1399 Title : no_residues
1400 Usage : $no = $ali->no_residues
1403 Function : number of residues in total
1404 : in the alignment
1407 Returns :
1408 Argument :
1410 =cut
1413 sub no_residues {
1414 my $self = shift;
1415 my $count = 0;
1416 my ($key,$temp);
1418 foreach $key ( keys %{$self->{'seq'}} ) {
1420 $temp = $self->{'seq'}->{$key}->str();
1423 $temp =~ s/[^A-Za-z]//g;
1425 $count += length($temp);
1428 return $count;
1433 =head2 purge
1435 Title : purge
1436 Usage : $aln->purge(0.7);
1437 Function: removes sequences above whatever %id
1438 Example :
1439 Returns : An array of the removed sequences
1440 Arguments
1442 This function will grind on large alignments. Beware!
1444 (perhaps not ideally implemented)
1446 =cut
1448 sub purge{
1449 my ($self,$perc) = @_;
1450 my (@seqs,$seq,%removed,$i,$j,$count,@one,@two,$seq2,$k,$res,$ratio,@ret);
1452 # $self->write_Pfam(\*STDOUT);
1454 @seqs = $self->eachSeq();
1456 #$self->write_Pfam(\*STDOUT);
1458 # foreach $seq ( @seqs ) {
1459 # printf("$seq %s %s\n",$seq->get_nse(),join(' ',$seq->dump()));
1462 for($i=0;$i< @seqs;$i++ ) {
1463 $seq = $seqs[$i];
1464 #printf "%s\n", $seq->out_fasta();
1465 #print "\n\nDone\n\n";
1467 # if it has already been removed, skip
1469 if( $removed{$seq->get_nse()} == 1 ) {
1470 next;
1473 # if not ... look at the other sequences
1475 # make the first array once
1477 @one = $seq->seq();
1478 for($j=$i+1;$j < @seqs;$j++) {
1479 $seq2 = $seqs[$j];
1480 if ( $removed{$seq2->get_nse()} == 1 ) {
1481 next;
1483 @two = $seq2->seq();
1484 $count = 0;
1485 $res = 0;
1486 for($k=0;$k<@one;$k++) {
1487 if( $one[$k] ne '.' && $one[$k] ne '-' && $one[$k] eq $two[$k]) {
1488 $count++;
1490 if( $one[$k] ne '.' && $one[$k] ne '-' && $two[$k] ne '.' && $two[$k] ne '-' ) {
1491 $res++;
1494 if( $res == 0 ) {
1495 $ratio = 0;
1496 } else {
1497 $ratio = $count/$res;
1500 if( $ratio > $perc ) {
1501 $removed{$seq2->get_nse()} = 1;
1502 $self->removeSeq($seq2);
1503 push(@ret,$seq2);
1504 } else {
1505 # could put a comment here!
1510 return @ret;
1515 =head2 percentage_identity
1517 Title : percentage_identity
1518 Usage : $id = $align->percentage_identity
1519 Function:
1520 The function uses a fast method to calculate the average percentage identity of the alignment
1521 Returns : The average percentage identity of the alignment
1522 Args : None
1524 =cut
1526 sub percentage_identity{
1527 my ($self,@args) = @_;
1529 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
1530 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
1532 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
1534 if (! $self->is_flush()) {
1535 $self->throw("All sequences in the alignment must be the same length");
1538 @seqs = $self->eachSeq();
1539 $len = $self->length_aln();
1541 # load the each hash with correct keys for existence checks
1542 for( my $index=0; $index < $len; $index++) {
1543 foreach my $letter (@alphabet) {
1544 $countHashes[$index]->{$letter} = 0;
1549 foreach my $seq (@seqs) {
1550 my @seqChars = $seq->ary();
1551 for( my $column=0; $column < @seqChars; $column++ ) {
1552 my $char = uc($seqChars[$column]);
1553 if (exists $countHashes[$column]->{$char}) {
1554 $countHashes[$column]->{$char}++;
1559 $total = 0;
1560 $divisor = 0;
1561 for(my $column =0; $column < $len; $column++) {
1562 my %hash = %{$countHashes[$column]};
1563 $subdivisor = 0;
1564 foreach my $res (keys %hash) {
1565 $total += $hash{$res}*($hash{$res} - 1);
1566 $subdivisor += $hash{$res};
1568 $divisor += $subdivisor * ($subdivisor - 1);
1570 return ($total / $divisor )*100.0;
1574 =head2 read_Prodom
1576 Title : read_Prodom
1577 Usage : $ali->read_Prodom( $file )
1578 Function: Reads in a Prodom format alignment
1579 Returns :
1580 Args : A filehandle glob or ref. to a filehandle object
1582 =cut
1584 sub read_Prodom{
1585 my $self = shift;
1586 my $file = shift;
1588 my ($acc, $fake_id, $start, $end, $seq, $add, %names);
1590 while (<$file>){
1591 if (/^AL\s+(\S+)\|(\S+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)$/){
1592 $acc=$1;
1593 $fake_id=$2; # Accessions have _species appended
1594 $start=$3;
1595 $end=$4;
1596 $seq=$5;
1598 $names{'fake_id'} = $fake_id;
1600 $add = new Bio::Seq('-seq'=>$seq,
1601 '-id'=>$acc,
1602 '-names'=>\%names,
1603 '-start'=>$start,
1604 '-end'=>$end,
1605 '-type'=>'aligned');
1607 $self->addSeq($add);