3 # BioPerl module for Bio::AlignIO::phylip
5 # Copyright Heikki Lehvaslaiho
10 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
14 # Do not use this module directly. Use it via the Bio::AlignIO class.
18 #you can set the name length to something other than the default 10
19 #if you use a version of phylip (hacked) that accepts ids > 10
20 my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
23 # convert data from one format to another
24 my $gcgstream = Bio::AlignIO->new(-format => 'msf',
25 -file => 't/data/cysprot1a.msf');
27 while( my $aln = $gcgstream->next_aln ) {
28 $phylipstream->write_aln($aln);
31 # do it again with phylip sequential format format
32 $phylipstream->interleaved(0);
33 # can also initialize the object like this
34 $phylipstream = Bio::AlignIO->new(-interleaved => 0,
38 $gcgstream = Bio::AlignIO->new(-format => 'msf',
39 -file => 't/data/cysprot1a.msf');
41 while( my $aln = $gcgstream->next_aln ) {
42 $phylipstream->write_aln($aln);
47 This object can transform Bio::SimpleAlign objects to and from PHYLIP
48 fotmat. By deafult it works with the interleaved format. By specifying
49 the flag -interleaved =E<gt> 0 in the initialization the module can
50 read or write data in sequential format.
52 Long IDs up to 50 characters are supported by flag -longid =E<gt>
53 1. ID strings can be surrounded by single quoted. They are mandatory
54 only if the IDs contain spaces.
60 Please direct usage questions or support issues to the mailing list:
62 I<bioperl-l@bioperl.org>
64 rather than to the module maintainer directly. Many experienced and
65 reponsive experts will be able look at the problem and quickly
66 address it. Please include a thorough description of the problem
67 with code and data examples if at all possible.
71 Report bugs to the Bioperl bug tracking system to help us keep track
72 the bugs and their resolution. Bug reports can be submitted via the
75 http://bugzilla.open-bio.org/
77 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
79 Email: heikki at ebi.ac.uk
80 Email: jason at bioperl.org
84 The rest of the documentation details each of the object
85 methods. Internal methods are usually preceded with a _
89 # Let the code begin...
91 package Bio
::AlignIO
::phylip
;
92 use vars
qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
96 use POSIX; # for the rounding call
98 use base qw(Bio::AlignIO);
101 $DEFAULTIDLENGTH = 10;
102 $DEFAULTLINELEN = 60;
109 Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
113 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
114 Returns : L<Bio::AlignIO> object
115 Args : [specific for writing of phylip format files]
116 -idlength => integer - length of the id (will pad w/
118 -interleaved => boolean - whether interleaved
119 or sequential format required
120 -line_length => integer of how long a sequence lines should be
121 -idlinebreak => insert a line break after the sequence id
122 so that sequence starts on the next line
123 -flag_SI => whether or not write a "S" or "I" just after
124 the num.seq. and line len., in the first line
125 -tag_length => integer of how long the tags have to be in
126 each line between the space separator. set it
127 to 0 to have 1 tag only.
128 -wrap_sequential => boolean for whether or not sequential
129 format should be broken up or a single line
130 default is false (single line)
131 -longid => boolean for allowing arbitrary long IDs (default is false)
136 my($self,@args) = @_;
137 $self->SUPER::_initialize
(@args);
139 my ($interleave,$linelen,$idlinebreak,
140 $idlength, $flag_SI, $tag_length,$ws, $longid) =
141 $self->_rearrange([qw(INTERLEAVED
149 $self->interleaved($interleave ?
1 : 0) if defined $interleave;
150 $self->idlength($idlength || $DEFAULTIDLENGTH);
151 $self->id_linebreak(1) if( $idlinebreak );
152 $self->line_length($linelen) if defined $linelen && $linelen > 0;
153 $self->flag_SI(1) if ( $flag_SI );
154 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
155 $self->wrap_sequential($ws ?
1 : 0);
156 $self->longid($longid ?
1 : 0);
163 Usage : $aln = $stream->next_aln()
164 Function: returns the next alignment in the stream.
165 Throws an exception if trying to read in PHYLIP
167 Returns : L<Bio::SimpleAlign> object
175 my ($seqcount, $residuecount, %hash, $name,$str,
176 @names,$seqname,$start,$end,$count,$seq);
178 my $aln = Bio
::SimpleAlign
->new(-source
=> 'phylip');
180 # skip blank lines until we see header line
181 # if we see a non-blank line that isn't the seqcount and residuecount line
182 # then bail out of next_aln (return)
183 HEADER
: while ($entry = $self->_readline) {
184 next if $entry =~ /^\s?$/;
185 if ($entry =~ /\s*(\d+)\s+(\d+)/) {
186 ($seqcount, $residuecount) = ($1, $2);
191 return unless $seqcount and $residuecount;
193 # first alignment section
194 my $idlen = $self->idlength;
197 my $interleaved = $self->interleaved;
198 while( $entry = $self->_readline) {
199 last if( $entry =~ /^\s?$/ && $interleaved );
201 # we've hit the next entry.
202 if( $entry =~ /^\s+(\d+)\s+(\d+)\s*$/) {
203 $self->_pushback($entry);
206 if( $self->longid && $entry =~ /\w/ ) {
208 $entry =~ /^\s*'([^']+)'\s+(.+)$/;
212 $entry =~ /^\s*([^\s]+)\s+(.+)$/;
216 # $name =~ s/[\s\/]/_/g; # not sure how wise is it to do this
217 $name =~ s/_+$//; # remove any trailing _'s
221 $count = scalar @names;
222 $hash{$count} = $str;
224 } elsif( $entry =~ /^\s+(.+)$/ ) {
228 $count = scalar @names;
229 $hash{$count} .= $str;
230 } elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ||
231 $entry =~ /^(.{$idlen})(\S{$idlen}\s+.+)\s$/ # Handle weirdnes s when id is too long
235 $name =~ s/[\s\/]/_
/g
;
236 $name =~ s/_+$//; # remove any trailing _'s
240 $count = scalar @names;
241 $hash{$count} = $str;
242 } elsif( $interleaved ) {
243 if( $entry =~ /^(\S+)\s+(.+)/ ||
244 $entry =~ /^(.{$idlen})(.*)\s$/ ) {
247 $name =~ s/[\s\/]/_
/g
;
248 $name =~ s/_+$//; # remove any trailing _'s
251 $count = scalar @names;
252 $hash{$count} = $str;
254 $self->debug("unmatched line: $entry");
257 $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount;
261 # interleaved sections
263 while( $entry = $self->_readline) {
264 # finish current entry
265 if($entry =~/\s*\d+\s+\d+/){
266 $self->_pushback($entry);
269 $count = 0, next if $entry =~ /^\s$/;
270 $entry =~ /\s*(.*)$/ && do {
274 $hash{$count} .= $str;
276 $self->throw("Not a valid interleaved PHYLIP file! [$count,$seqcount] ($entry)") if $count > $seqcount;
279 return if scalar @names < 1;
283 foreach $name ( @names ) {
285 if( $name =~ /(\S+)\/(\d
+)-(\d
+)/ ) {
292 $str = $hash{$count};
293 $str =~ s/[^A-Za-z]//g;
297 $self->throw("Length of sequence [$seqname] is not [$residuecount] it is ".CORE
::length($hash{$count})."! ")
298 unless CORE
::length($hash{$count}) == $residuecount;
300 $seq = Bio
::LocatableSeq
->new('-seq' => $hash{$count},
301 '-display_id' => $seqname,
304 '-alphabet' => $self->alphabet,
309 return $aln if $aln->num_sequences;
317 Usage : $stream->write_aln(@aln)
318 Function: writes the $aln object into the stream in phylip format
319 Returns : 1 for success and 0 for error
320 Args : L<Bio::Align::AlignI> object
325 my ($self,@aln) = @_;
329 my $width = $self->line_length();
330 my ($length,$date,$name,$seq,$miss,$pad,
331 %hash,@arr,$tempcount,$index,$idlength,$flag_SI,$line_length, $tag_length);
333 foreach my $aln (@aln) {
334 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
335 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
338 $self->throw("All sequences in the alignment must be the same length")
339 unless $aln->is_flush(1) ;
341 $flag_SI = $self->flag_SI();
342 $aln->set_displayname_flat(); # plain
343 $length = $aln->length();
345 if ($self->interleaved() ) {
346 $self->_print (sprintf(" %s %s I\n", $aln->num_sequences, $aln->length));
348 $self->_print (sprintf(" %s %s S\n", $aln->num_sequences, $aln->length));
351 $self->_print (sprintf(" %s %s\n", $aln->num_sequences, $aln->length));
354 $idlength = $self->idlength();
355 $line_length = $self->line_length();
356 $tag_length = $self->tag_length();
357 foreach $seq ( $aln->each_seq() ) {
358 $name = $aln->displayname($seq->get_nse);
360 $self->warn("The lenght of the name is over 50 chars long [$name]")
361 if length($name) > 50;
364 $name = substr($name, 0, $idlength) if length($name) > $idlength;
365 $name = sprintf("%-".$idlength."s",$name);
366 if( $self->interleaved() ) {
368 } elsif( $self->id_linebreak) {
372 #phylip needs dashes not dots
373 my $seq = $seq->seq();
379 if( $self->interleaved() ) {
381 if ($tag_length <= $line_length) {
382 $numtags = floor
($line_length/$tag_length);
383 $line_length = $tag_length*$numtags;
387 while( $count < $length ) {
389 # there is another block to go!
390 foreach $name ( @arr ) {
391 my $dispname = $name;
392 $dispname = '' if $wrapped;
393 $self->_print (sprintf("%".($idlength+3)."s",$dispname));
396 $self->debug("residue count: $count\n") if ($count%100000 == 0);
397 while( ($tempcount + $tag_length < $length) &&
398 ($index < $numtags) ) {
399 $self->_print (sprintf("%s ",substr($hash{$name},
402 $tempcount += $tag_length;
406 if( $index < $numtags) {
408 $self->_print (sprintf("%s ",substr($hash{$name},
410 $tempcount += $tag_length;
412 $self->_print ("\n");
414 $self->_print ("\n");
419 foreach $name ( @arr ) {
420 my $dispname = $name;
421 my $line = sprintf("%s%s\n",$dispname,$hash{$name});
422 if( $self->wrap_sequential ) {
423 $line =~ s/(.{1,$width})/$1\n/g;
425 $self->_print ($line);
429 $self->flush if $self->_flush_on_write && defined $self->_fh;
436 Usage : my $interleaved = $obj->interleaved
437 Function: Get/Set Interleaved status
445 my ($self,$value) = @_;
446 if( defined $value ) {
447 if ($value) {$self->{'_interleaved'} = 1 }
448 else {$self->{'_interleaved'} = 0 }
450 return 1 unless defined $self->{'_interleaved'};
451 return $self->{'_interleaved'};
457 Usage : my $flag = $obj->flag_SI
458 Function: Get/Set if the Sequential/Interleaved flag has to be shown
459 after the number of sequences and sequence length
468 my ($self,$value) = @_;
469 my $previous = $self->{'_flag_SI'};
470 if( defined $value ) {
471 $self->{'_flag_SI'} = $value;
479 Usage : my $idlength = $obj->idlength
480 Function: Get/Set value of id length
488 my($self,$value) = @_;
490 $self->{'_idlength'} = $value;
492 return $self->{'_idlength'};
498 Usage : $obj->line_length($newval)
500 Returns : value of line_length
501 Args : newvalue (optional)
507 my ($self,$value) = @_;
508 if( defined $value) {
509 $self->{'_line_length'} = $value;
511 return $self->{'_line_length'} || $DEFAULTLINELEN;
518 Usage : $obj->tag_length($newval)
520 Example : my $tag_length = $obj->tag_length
521 Returns : value of the length for each space-separated tag in a line
522 Args : newvalue (optional) - set to zero to have one tag per line
528 my ($self,$value) = @_;
529 if( defined $value) {
530 $self->{'_tag_length'} = $value;
532 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
539 Usage : $obj->id_linebreak($newval)
541 Returns : value of id_linebreak
542 Args : newvalue (optional)
548 my ($self,$value) = @_;
549 if( defined $value) {
550 $self->{'_id_linebreak'} = $value;
552 return $self->{'_id_linebreak'} || 0;
556 =head2 wrap_sequential
558 Title : wrap_sequential
559 Usage : $obj->wrap_sequential($newval)
561 Returns : value of wrap_sequential
562 Args : newvalue (optional)
568 my ($self,$value) = @_;
569 if( defined $value) {
570 $self->{'_wrap_sequential'} = $value;
572 return $self->{'_wrap_sequential'} || 0;
578 Usage : $obj->longid($newval)
580 Returns : value of longid
581 Args : newvalue (optional)
587 my ($self,$value) = @_;
588 if( defined $value) {
589 $self->{'_longid'} = $value;
591 return $self->{'_longid'} || 0;