3 # BioPerl module for SimpleAlign
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 SimpleAlign - Multiple alignments held as a set of sequences
19 $aln = new Bio::SimpleAlign;
21 $aln->read_MSF(\*STDIN);
23 $aln->write_fasta(\*STDOUT);
27 This module is included with the central Bioperl distribution:
29 http://bio.perl.org/Core/Latest
30 ftp://bio.perl.org/pub/DIST
32 Follow the installation instructions included in the README file.
36 SimpleAlign handles multiple alignments of sequences. It is very permissive
37 of types (it wont insist on things being all same length etc): really
38 it is a SequenceSet explicitly held in memory with a whole series of
39 built in manipulations and especially file format systems for
40 read/writing alignments.
42 SimpleAlign basically views an alignment as an immutable block of text.
43 SimpleAlign *is not* the object to be using if you want to manipulate an
44 alignment (eg, truncate an alignment or remove columns that are all gaps).
45 These functions are much better done by UnivAln by Georg Fuellen.
47 However for lightweight display/formatting - this is the one to use.
49 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
50 the alignment, and this is the key for the internal hashes.
51 (name,start,end is abreviated nse in the code). However, in many cases
52 people don't want the name/start-end to be displayed: either multiple
53 names in an alignment or names specific to the alignment
54 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
55 'displayname', and generally is what is used to print out the
56 alignment. They default to name/start-end
58 The SimpleAlign Module came from Ewan Birney's Align module
62 SimpleAlign is being slowly converted to bioperl coding standards,
67 =item Use Bio::Root::Object - done
69 =item Use proper exceptions - done
71 =item Use hashed constructor - not done!
79 User feedback is an integral part of the evolution of this and other Bioperl modules.
80 Send your comments and suggestions preferably to one of the Bioperl mailing lists.
81 Your participation is much appreciated.
83 vsns-bcd-perl@lists.uni-bielefeld.de - General discussion
84 vsns-bcd-perl-guts@lists.uni-bielefeld.de - Technically-oriented discussion
85 http://bio.perl.org/MailList.html - About the mailing lists
89 Report bugs to the Bioperl bug tracking system to help us keep track the bugs and
90 their resolution. Bug reports can be submitted via email or the web:
92 bioperl-bugs@bio.perl.org
93 http://bio.perl.org/bioperl-bugs/
97 Ewan Birney, birney@sanger.ac.uk
101 Bio::Seq.pm - The biosequence object
103 http://bio.perl.org/Projects/modules.html - Online module documentation
104 http://bio.perl.org/Projects/SeqAlign/ - Bioperl sequence alignment project
105 http://bio.perl.org/ - Bioperl Project Homepage
109 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
114 # Let the code begin...
117 package Bio
::SimpleAlign
;
118 use vars
qw($AUTOLOAD @ISA);
121 # Object preamble - inheriets from Bio::Root::Object
123 use Bio::Root::Object;
124 use Bio::Seq; # uses Seq's as list
127 @ISA = qw(Bio::Root::Object Exporter);
128 # new() is inherited from Bio::Root::Object
130 # _initialize is where the heavy stuff will happen when new is called
133 my($self,@args) = @_;
135 my $make = $self->SUPER::_initialize
;
137 # we need to set up internal hashs first!
140 $self->{'order'} = {};
141 $self->{'dis_name'} = {};
142 $self->{'id'} = 'NoName';
144 # maybe we should automatically read in from args. Hmmm...
146 # set stuff in self from @args
147 return $make; # success - we hope!
156 Usage : $myalign->id("Ig")
157 Function : Gets/sets the id field of the alignment
159 Returns : An id string
160 Argument : An id string (optional)
165 my ($self, $name) = @_;
167 if (defined( $name )) {
168 $self->{'id'} = $name;
171 return $self->{'id'};
180 Usage : $myalign->addSeq($newseq);
183 Function : Adds another sequence to the alignment
184 : *doesn't* align it - just adds it to the
197 my ($name,$id,$start,$end);
200 $start = $seq->start();
204 if( !defined $order ) {
205 $order = keys %{$self->{'seq'}};
208 $name = sprintf("%s-%d-%d",$id,$start,$end);
210 if( $self->{'seq'}->{$name} ) {
211 $self->warn("Replacing one sequence [$name]\n");
214 $self->{'seq'}->{$name} = $seq;
215 $self->{'order'}->{$order} = $name;
223 Usage : $aln->removeSeq($seq);
224 Function : removes a single sequence from an alignment
233 my ($name,$id,$start,$end);
235 $seq->out_fasta(\
*STDOUT
);
237 $start = $seq->start();
239 $name = sprintf("%s-%d-%d",$id,$start,$end);
241 if( !exists $self->{'seq'}->{$name} ) {
242 $self->throw("Sequence $name does not exist in the alignment to remove!");
245 delete $self->{'seq'}->{$name};
247 # we can't do anything about the order hash but that is ok
248 # because eachSeq will handle it
255 Usage : foreach $seq ( $align->eachSeq() )
258 Function : gets an array of Seq objects from the
271 foreach $order ( sort { $a <=> $b } keys %{$self->{'order'}} ) {
272 if( exists $self->{'seq'}->{$self->{'order'}->{$order}} ) {
273 #print STDOUT sprintf("Putting %s\n", $self->{'order'}->{$order});
274 push(@arr,$self->{'seq'}->{$self->{'order'}->{$order}});
283 =head2 consensus_string
285 Title : consensus_string
286 Usage : $str = $ali->consensus_string()
289 Function : Makes a consensus
298 sub consensus_string
{
305 $len = $self->length_aln();
307 foreach $count ( 0 .. $len ) {
308 $out .= $self->consensus_aa($count);
318 my ($seq,%hash,$count,$letter,$key);
321 foreach $seq ( $self->eachSeq() ) {
322 $letter = substr($seq->{'seq'},$point,1);
323 ($letter =~ /\./) && next;
324 # print "Looking at $letter\n";
331 foreach $key ( keys %hash ) {
332 # print "Now at $key $hash{$key}\n";
333 if( $hash{$key} > $count ) {
335 $count = $hash{$key};
347 Usage : $al->read_MSF(\*STDIN);
348 Function: reads MSF formatted files. Tries to read *all* MSF
349 It reads all non whitespace characters in the alignment
350 area. For MSFs with weird gaps (eg ~~~) map them by using
351 $al->map_chars('~','-');
361 my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq);
363 # read in the name section
366 /\/\
// && last; # move to alignment section
367 /Name:\s+(\S+)/ && do { $name = $1;
368 $hash{$name} = ""; # blank line
369 push(@names,$name); # we need it ordered!
377 /^\s+(\S+)\s+(.*)$/ && do {
380 if( ! exists $hash{$name} ) {
381 $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
384 $hash{$name} .= $str;
388 # now got this as a name - sequence hash. Lets make some sequences!
392 foreach $name ( @names ) {
393 if( $name =~ /(\S+)\/(\d
+)-(\d
+)/ ) {
401 $str =~ s/[^A-Za-z]//g;
405 $seq = new Bio
::Seq
('-seq'=>$hash{$name},
422 Usage : $ali->write_MSF(\*FH)
425 Function : writes MSF format output
441 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index);
443 $date = localtime(time);
446 $maxname = $self->maxnse_length();
447 $length = $self->length_aln();
449 if( !defined $name ) {
453 print $file sprintf("\n%s MSF: %d Type: P %s Check: 00 ..\n\n",$name,$self->no_sequences,$date);
455 foreach $seq ( $self->eachSeq() ) {
456 $name = $self->get_displayname($seq->get_nse());
457 $miss = $maxname - length ($name);
461 print $file sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$name,$pad,length $seq->str(),$seq->GCG_checksum());
463 $hash{$name} = $seq->str();
468 # ok - heavy handed, but there you go.
472 while( $count < $length ) {
474 # there is another block to go!
476 foreach $name ( @arr ) {
477 print $file sprintf("%20s ",$name);
481 while( ($tempcount + 10 < $length) && ($index < 5) ) {
483 print $file sprintf("%s ",substr($hash{$name},$tempcount,10));
490 # ok, could be the very last guy ;)
499 print $file sprintf("%s ",substr($hash{$name},$tempcount));
504 } # end of each sequence
518 foreach $seq ( $self->eachSeq() ) {
519 $len = length $seq->id();
521 if( $len > $maxname ) {
534 foreach $seq ( $self->eachSeq() ) {
535 $len = length $seq->get_nse();
537 if( $len > $maxname ) {
545 sub maxdisplayname_length
{
550 foreach $seq ( $self->eachSeq() ) {
551 $len = length $self->get_displayname($seq->get_nse());
553 if( $len > $maxname ) {
565 Usage : $len = $ali->length_aln()
568 Function : returns the maximum length of the alignment.
569 : To be sure the alignment is a block, use is_flush
583 foreach $seq ( $self->eachSeq() ) {
584 $temp = length($seq->str());
585 if( $temp > $length ) {
597 Usage : if( $ali->is_flush() )
600 Function : Tells you whether the alignment
601 : is flush, ie all of the same length
615 foreach $seq ( $self->eachSeq() ) {
616 if( $length == (-1) ) {
617 $length = length($seq->str());
621 $temp = length($seq->str());
623 if( $temp != $length ) {
635 Usage : $ali->read_fasta(\*INPUT)
638 Function : reads in a fasta formatted
639 : file for an alignment
651 my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align);
656 if( defined $name ) {
657 # put away last name and sequence
659 if( $name =~ /(\S+)\/(\d
+)-(\d
+)/ ) {
666 $end = length($align{$name});
669 $seq = new Bio
::Seq
('-seq'=>$seqchar,
687 # put away last name and sequence
689 if( $name =~ /(\S+)\/(\d
+)-(\d
+)/ ) {
696 $end = length($align{$name});
699 $seq = new Bio
::Seq
('-seq'=>$seqchar,
716 Usage : $ali->read_selex(\*INPUT)
719 Function : reads selex (hmmer) format
731 my ($start,$end,%align,$name,$seqname,$seq,$count,%hash,%c2name,$no);
733 # not quite sure 'what' selex format is, but here we go!
747 warn("Exhausted file without reading selex format");
750 # first line starting with word
755 if( !/^(\S+)\s+([A-Za-z\.\-]+)\s*/ ) {
756 warn("Ok. I think [$_] does not look like selex format. Not sure...");
763 $align{$name} .= $seq;
764 $c2name{$count} = $name;
768 !/^([^\#]\S+)\s+([A-Za-z\.\-]+)\s*/ && next;
773 if( ! defined $align{$name} ) {
774 $c2name{$count} = $name;
778 $align{$name} .= $seq;
781 # ok... now we can make the sequences
784 foreach $no ( sort { $a <=> $b } keys %c2name ) {
785 $name = $c2name{$no};
787 if( $name =~ /(\S+)\/(\d
+)-(\d
+)/ ) {
794 $end = length($align{$name});
797 $seq = new Bio
::Seq
('-seq'=>$align{$name},
816 Usage : $ali->read_mase(\*INPUT)
819 Function : reads mase (seaview)
820 : formatted alignments
841 if( /^(\S+)\/(\d
+)-(\d
+)/ ) {
863 $add = new Bio
::Seq
('-seq'=>$seq,
881 =head2 read_Pfam_file
883 Title : read_Pfam_file
884 Usage : $ali->read_Pfam_file("thisfile");
886 Function : opens a filename, reads
887 : a Pfam (mul) formatted alignment
901 if( open(_TEMPFILE
,$file) == 0 ) {
902 $self->throw("Could not open $file for reading Pfam style alignment");
906 $out = read_Pfam
($self,\
*_TEMPFILE
);
917 Usage : $ali->read_Pfam(\*INPUT)
920 Function : reads a Pfam formatted
921 : Alignment (Mul format).
922 : - this is the format used by Belvu
944 if( !/^(\S+)\/(\d
+)-(\d
+)\s
+(\S
+)\s
*/ ) {
945 $self->throw("Found a bad line [$_] in the pfam format alignment");
955 # there may be an accession number at the end of the line
957 ($acc) = ($' =~ /\s*(\S+)\s*/);
958 $names{'acc
'} = $acc;
960 $add = new Bio::Seq('-seq
'=>$seq,
975 sub write_Pfam_link {
981 my ($namestr,$linkstr,$seq,$name,$start,$end,$disname,$add,$place);
983 if( !defined $len ) {
987 foreach $seq ( $self->eachSeq() ) {
989 $disname = $self->get_displayname($seq->get_nse());
991 $add = $len - length($disname);
995 # print $out "Going to eval \$linkstr = $link\n";
996 eval ("\$linkstr = \"$link\"");
998 print $out sprintf("%s%s%s %s\n",$linkstr,$place,$seq->str(),$seq->names()->{'acc
'});
1000 print $out sprintf("%s%s%s\n",$linkstr,$place,$seq->str());
1009 Usage : $ali->write_Pfam(\*OUTPUT)
1012 Function : writes a Pfam/Mul formatted
1025 my ($namestr,$seq,$add);
1028 $maxn = $self->maxdisplayname_length();
1030 foreach $seq ( $self->eachSeq() ) {
1031 $namestr = $self->get_displayname($seq->get_nse());
1032 $add = $maxn - length($namestr) + 2;
1033 $namestr .= " " x $add;
1034 if( defined $acc && $acc == 1) {
1035 print $out sprintf("%s %s %s\n",$namestr,$seq->str(),$seq->names()->{'acc
'});
1037 print $out sprintf("%s %s\n",$namestr,$seq->str());
1043 =head2 write_clustalw
1045 Title : write_clustalw
1046 Usage : $ali->write_clustalw
1049 Function : writes a clustalw formatted
1058 sub write_clustalw {
1061 my ($count,$length,$seq,@seq,$tempcount);
1063 print $file "CLUSTAL W(1.4) multiple sequence alignment\n\n\n";
1065 $length = $self->length_aln();
1066 $count = $tempcount = 0;
1067 @seq = $self->eachSeq();
1069 while( $count < $length ) {
1070 foreach $seq ( @seq ) {
1071 print $file sprintf("%-22s %s\n",$self->get_displayname($seq->get_nse()),substr($seq->str(),$count,50));
1084 Usage : $ali->write_fasta(\*OUTPUT)
1087 Function : writes a fasta formatted alignment
1090 Argument : reference-to-glob to file or filehandle object
1097 my ($seq,$rseq,$name,$count,$length,$seqsub);
1099 foreach $rseq ( $self->eachSeq() ) {
1100 $name = $self->get_displayname($rseq->get_nse());
1101 $seq = $rseq->str();
1103 print $file ">$name\n";
1106 $length = length($seq);
1107 while( ($count * 60 ) < $length ) {
1108 $seqsub = substr($seq,$count*60,60);
1109 print $file "$seqsub\n";
1117 sub set_displayname {
1120 my $disname = shift;
1122 # print "Setting $name to $disname\n";
1123 $self->{dis_name}->{$name} = $disname;
1126 sub get_displayname {
1130 if( defined $self->{dis_name}->{$name} ) {
1131 return $self->{dis_name}->{$name};
1137 =head2 set_displayname_flat
1139 Title : set_displayname_flat
1140 Usage : $ali->set_displayname_flat()
1143 Function : Makes all the sequences be displayed
1144 : as just their name, not name/start-end
1152 sub set_displayname_flat {
1156 foreach $seq ( $self->eachSeq() ) {
1157 $nse = $seq->get_nse();
1158 $self->set_displayname($nse,$seq->id());
1162 =head2 set_displayname_normal
1164 Title : set_displayname_normal
1165 Usage : $ali->set_displayname_normal()
1168 Function : Makes all the sequences be displayed
1177 sub set_displayname_normal {
1181 foreach $seq ( $self->eachSeq() ) {
1182 $nse = $seq->get_nse();
1183 $self->set_displayname($nse,sprintf("%s/%d-%d",$seq->id(),$seq->start(),$seq->end()));
1187 =head2 set_displayname_count
1189 Title : set_displayname_count
1190 Usage : $ali->set_displayname_count
1193 Function : sets the names to be name_#
1194 : where # is the number of times this
1195 : name has been used.
1203 sub set_displayname_count {
1205 my (@arr,$name,$seq,$count,$temp,$nse);
1207 foreach $seq ( $self->each_alphabetically() ) {
1208 $nse = $seq->get_nse();
1210 #name will be set when this is the second
1211 #time (or greater) is has been seen
1213 if( $name eq ($seq->id()) ) {
1214 $temp = sprintf("%s_%s",$name,$count);
1215 $self->set_displayname($nse,$temp);
1220 $temp = sprintf("%s_%s",$name,$count);
1221 $self->set_displayname($nse,$temp);
1228 =head2 each_alphabetically
1230 Title : each_alphabetically
1231 Usage : foreach $seq ( $ali->each_alphabetically() )
1234 Function : returns an array of sequence object sorted
1235 : alphabetically by name and then by start point
1237 : Does not change the order of the alignment
1244 sub each_alphabetically {
1246 my ($seq,$nse,@arr,%hash,$count);
1248 foreach $seq ( $self->eachSeq() ) {
1249 $nse = $seq->get_nse("-","-");
1254 foreach $nse ( sort alpha_startend keys %hash) {
1255 push(@arr,$hash{$nse});
1263 =head2 sort_alphabetically
1265 Title : sort_alphabetically
1266 Usage : $ali->sort_alphabetically
1269 Function : changes the order of the alignemnt
1270 : to alphabetical on name followed by
1271 : numerical by number
1278 sub sort_alphabetically {
1280 my ($seq,$nse,@arr,%hash,$count);
1283 foreach $seq ( $self->eachSeq() ) {
1284 $nse = $seq->get_nse("-","-");
1290 %{$self->{'order
'}} = (); # reset the hash;
1292 foreach $nse ( sort alpha_startend keys %hash) {
1293 $self->{'order
'}->{$count} = $nse;
1303 Usage : $ali->map_chars('\
.','-')
1306 Function : does a s/$arg1/$arg2/ on
1307 : the sequences. Useful for
1310 : Notice that the from (arg1) is interpretted
1311 : as a regex, so be careful about quoting meta
1312 : characters (eg $ali->map_chars('.','-') wont
1326 foreach $seq ( $self->eachSeq() ) {
1327 $temp = $seq->str();
1328 $temp =~ s/$from/$to/g;
1329 $seq->setseq($temp);
1336 Usage : $ali->uppercase()
1339 Function : Sets all the sequences
1353 foreach $seq ( $self->eachSeq() ) {
1354 $temp = $seq->str();
1355 $temp =~ tr/[a-z]/[A-Z]/;
1357 $seq->setseq($temp);
1362 sub alpha_startend {
1363 my ($aname,$astart,$bname,$bstart);
1364 ($aname,$astart) = split (/-/,$a);
1365 ($bname,$bstart) = split (/-/,$b);
1367 if( $aname eq $bname ) {
1368 return $astart <=> $bstart;
1371 return $aname cmp $bname;
1378 Title : no_sequences
1379 Usage : $depth = $ali->no_sequences
1382 Function : number of sequence in the
1383 : sequence alignment
1394 return keys %{$self->{'seq
'}};
1400 Usage : $no = $ali->no_residues
1403 Function : number of residues in total
1418 foreach $key ( keys %{$self->{'seq
'}} ) {
1420 $temp = $self->{'seq
'}->{$key}->str();
1423 $temp =~ s/[^A-Za-z]//g;
1425 $count += length($temp);
1436 Usage : $aln->purge(0.7);
1437 Function: removes sequences above whatever %id
1439 Returns : An array of the removed sequences
1442 This function will grind on large alignments. Beware!
1444 (perhaps not ideally implemented)
1449 my ($self,$perc) = @_;
1450 my (@seqs,$seq,%removed,$i,$j,$count,@one,@two,$seq2,$k,$res,$ratio,@ret);
1452 # $self->write_Pfam(\*STDOUT);
1454 @seqs = $self->eachSeq();
1456 #$self->write_Pfam(\*STDOUT);
1458 # foreach $seq ( @seqs ) {
1459 # printf("$seq %s %s\n",$seq->get_nse(),join(' ',$seq->dump()));
1462 for($i=0;$i< @seqs;$i++ ) {
1464 #printf "%s\n", $seq->out_fasta();
1465 #print "\n\nDone\n\n";
1467 # if it has already been removed, skip
1469 if( $removed{$seq->get_nse()} == 1 ) {
1473 # if not ... look at the other sequences
1475 # make the first array once
1478 for($j=$i+1;$j < @seqs;$j++) {
1480 if ( $removed{$seq2->get_nse()} == 1 ) {
1483 @two = $seq2->seq();
1486 for($k=0;$k<@one;$k++) {
1487 if( $one[$k] ne '.' && $one[$k] ne '-' && $one[$k] eq $two[$k]) {
1490 if( $one[$k] ne '.' && $one[$k] ne '-' && $two[$k] ne '.' && $two[$k] ne '-' ) {
1497 $ratio = $count/$res;
1500 if( $ratio > $perc ) {
1501 $removed{$seq2->get_nse()} = 1;
1502 $self->removeSeq($seq2);
1505 # could put a comment here!
1515 =head2 percentage_identity
1517 Title : percentage_identity
1518 Usage : $id = $align->percentage_identity
1520 The function uses a fast method to calculate the average percentage identity of the alignment
1521 Returns : The average percentage identity of the alignment
1526 sub percentage_identity{
1527 my ($self,@args) = @_;
1529 my @alphabet = ('A
','B
','C
','D
','E
','F
','G
','H
','I
','J
','K
','L
','M
',
1530 'N
','O
','P
','Q
','R
','S
','T
','U
','V
','W
','X
','Y
','Z
');
1532 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
1534 if (! $self->is_flush()) {
1535 $self->throw("All sequences in the alignment must be the same length");
1538 @seqs = $self->eachSeq();
1539 $len = $self->length_aln();
1541 # load the each hash with correct keys for existence checks
1542 for( my $index=0; $index < $len; $index++) {
1543 foreach my $letter (@alphabet) {
1544 $countHashes[$index]->{$letter} = 0;
1549 foreach my $seq (@seqs) {
1550 my @seqChars = $seq->ary();
1551 for( my $column=0; $column < @seqChars; $column++ ) {
1552 my $char = uc($seqChars[$column]);
1553 if (exists $countHashes[$column]->{$char}) {
1554 $countHashes[$column]->{$char}++;
1561 for(my $column =0; $column < $len; $column++) {
1562 my %hash = %{$countHashes[$column]};
1564 foreach my $res (keys %hash) {
1565 $total += $hash{$res}*($hash{$res} - 1);
1566 $subdivisor += $hash{$res};
1568 $divisor += $subdivisor * ($subdivisor - 1);
1570 return ($total / $divisor )*100.0;
1577 Usage : $ali->read_Prodom( $file )
1578 Function: Reads in a Prodom format alignment
1580 Args : A filehandle glob or ref. to a filehandle object
1588 my ($acc, $fake_id, $start, $end, $seq, $add, %names);
1591 if (/^AL\s+(\S+)\|(\S+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)$/){
1593 $fake_id=$2; # Accessions have _species appended
1598 $names{'fake_id
'} = $fake_id;
1600 $add = new Bio::Seq('-seq
'=>$seq,
1605 '-type
'=>'aligned
');
1607 $self->addSeq($add);