2 # BioPerl module for Bio::AlignIO::phylip
4 # Copyright Heikki Lehvaslaiho
9 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
13 # Do not use this module directly. Use it via the Bio::AlignIO class.
17 #you can set the name length to something other than the default 10
18 #if you use a version of phylip (hacked) that accepts ids > 10
19 my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
22 # convert data from one format to another
23 my $gcgstream = Bio::AlignIO->new(-format => 'msf',
24 -file => 't/data/cysprot1a.msf');
26 while( my $aln = $gcgstream->next_aln ) {
27 $phylipstream->write_aln($aln);
30 # do it again with phylip sequential format format
31 $phylipstream->interleaved(0);
32 # can also initialize the object like this
33 $phylipstream = Bio::AlignIO->new(-interleaved => 0,
37 $gcgstream = Bio::AlignIO->new(-format => 'msf',
38 -file => 't/data/cysprot1a.msf');
40 while( my $aln = $gcgstream->next_aln ) {
41 $phylipstream->write_aln($aln);
46 This object can transform Bio::SimpleAlign objects to and from PHYLIP
47 format. By default it works with the interleaved format. By specifying
48 the flag -interleaved =E<gt> 0 in the initialization the module can
49 read or write data in sequential format.
51 Long IDs up to 50 characters are supported by flag -longid =E<gt>
52 1. ID strings can be surrounded by single quoted. They are mandatory
53 only if the IDs contain spaces.
59 Please direct usage questions or support issues to the mailing list:
61 I<bioperl-l@bioperl.org>
63 rather than to the module maintainer directly. Many experienced and
64 reponsive experts will be able look at the problem and quickly
65 address it. Please include a thorough description of the problem
66 with code and data examples if at all possible.
70 Report bugs to the Bioperl bug tracking system to help us keep track
71 the bugs and their resolution. Bug reports can be submitted via the
74 https://github.com/bioperl/bioperl-live/issues
76 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
78 Email: heikki at ebi.ac.uk
79 Email: jason at bioperl.org
83 The rest of the documentation details each of the object
84 methods. Internal methods are usually preceded with a _
88 # Let the code begin...
90 package Bio
::AlignIO
::phylip
;
91 use vars
qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
95 use POSIX; # for the rounding call
97 use base qw(Bio::AlignIO);
100 $DEFAULTIDLENGTH = 10;
101 $DEFAULTLINELEN = 60;
108 Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
112 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
113 Returns : L<Bio::AlignIO> object
114 Args : [specific for writing of phylip format files]
115 -idlength => integer - length of the id (will pad w/
117 -interleaved => boolean - whether interleaved
118 or sequential format required
119 -line_length => integer of how long a sequence lines should be
120 -idlinebreak => insert a line break after the sequence id
121 so that sequence starts on the next line
122 -flag_SI => whether or not write a "S" or "I" just after
123 the num.seq. and line len., in the first line
124 -tag_length => integer of how long the tags have to be in
125 each line between the space separator. set it
126 to 0 to have 1 tag only.
127 -wrap_sequential => boolean for whether or not sequential
128 format should be broken up or a single line
129 default is false (single line)
130 -longid => boolean for allowing arbitrary long IDs (default is false)
135 my($self,@args) = @_;
136 $self->SUPER::_initialize
(@args);
138 my ($interleave,$linelen,$idlinebreak,
139 $idlength, $flag_SI, $tag_length,$ws, $longid) =
140 $self->_rearrange([qw(INTERLEAVED
148 $self->interleaved($interleave ?
1 : 0) if defined $interleave;
149 $self->idlength($idlength || $DEFAULTIDLENGTH);
150 $self->id_linebreak(1) if( $idlinebreak );
151 $self->line_length($linelen) if defined $linelen && $linelen > 0;
152 $self->flag_SI(1) if ( $flag_SI );
153 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
154 $self->wrap_sequential($ws ?
1 : 0);
155 $self->longid($longid ?
1 : 0);
162 Usage : $aln = $stream->next_aln()
163 Function: returns the next alignment in the stream.
164 Throws an exception if trying to read in PHYLIP
166 Returns : L<Bio::SimpleAlign> object
174 my ($seqcount, $residuecount, %hash, $name,$str,
175 @names,$seqname,$start,$end,$count,$seq);
177 my $aln = Bio
::SimpleAlign
->new(-source
=> 'phylip');
179 # First, parse up through the header.
180 # If we see a non-blank line that isn't the seqcount and residuecount line
181 # then bail out of next_aln (return)
182 while ($entry = $self->_readline) {
183 if ($entry =~ /^\s?$/) {
185 } elsif ($entry =~ /\s*(\d+)\s+(\d+)/) {
186 ($seqcount, $residuecount) = ($1, $2);
189 $self->warn ("Failed to parse PHYLIP: Did not see a sequence count and residue count.");
193 return unless $seqcount and $residuecount;
195 # First alignment section. We expect to see a name and (part of) a sequence.
196 my $idlen = $self->idlength;
199 while ($entry = $self->_readline) {
200 if ($entry =~ /^\s?$/) { # eat the newlines
204 # Names can be in a few different formats:
205 # 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
206 # 2. they can be hacked with a long ID, as passed in with the flag -longid.
207 # 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
208 if ($self->longid()) { # 2 or 3
209 if ($entry =~ /^'(.+)'\s+(.+)$/) { # 3. name has single quotes.
212 } else { # 2. name does not have single quotes, so should not have spaces.
213 # therefore, the first part of the line is the name and the rest is the seq.
214 # make sure that the line does not lead with extra spaces.
216 ($name, $str) = split (/\s+/,$entry, 2);
218 } else { # 1. traditional phylip.
219 $entry =~ /^(.{10})\s+(.+)$/;
222 $name =~ s/\s+$//; # eat any trailing spaces
226 #clean sequence of spaces:
229 # are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
230 if (($self->interleaved) == 0) {
231 while (length($str) < $residuecount) {
232 $entry = $self->_readline;
235 if ($entry =~ /^\s*$/) { # we ran into a newline before we got a complete sequence: bail!
236 $self->warn("Failed to parse PHYLIP: Sequence $name was shorter than expected: " . length($str) . " instead of $residuecount.");
241 $hash{$count} = $str;
244 # if we've read as many seqs as we're supposed to, move on.
245 if ($count == $seqcount) {
250 # if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
251 if ($self->interleaved) {
252 while (length($hash{$seqcount-1}) < $residuecount) {
254 while ($entry = $self->_readline) {
255 if ($entry =~ /^\s*$/) { # eat newlines
256 if ($count != 0) { # there was a newline at an unexpected place!
257 $self->warn("Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount.");
261 } else { # start taking in chunks
263 $hash{$count} .= $entry;
266 if ($count >= $seqcount) { # we've read all of the sequences for this chunk, so move on.
272 if ((scalar @names) != $seqcount) {
273 $self->warn("Failed to parse PHYLIP: Did not see the correct number of seqs: saw " . scalar(@names) . ", expected $seqcount.");
276 for ($count=0; $count<$seqcount; $count++) {
277 $str = $hash{$count};
278 my $seqname = $names[$count];
279 if (length($str) != $residuecount) {
280 $self->warn("Failed to parse PHYLIP: Sequence $seqname was the wrong length: " . length($str) . " instead of $residuecount.");
282 $seq = Bio
::LocatableSeq
->new('-seq' => $hash{$count},
283 '-display_id' => $seqname);
292 Usage : $stream->write_aln(@aln)
293 Function: writes the $aln object into the stream in phylip format
294 Returns : 1 for success and 0 for error
295 Args : L<Bio::Align::AlignI> object
300 my ($self,@aln) = @_;
304 my $width = $self->line_length();
305 my ($length,$date,$name,$seq,$miss,$pad,
306 %hash,@arr,$tempcount,$index,$idlength,$flag_SI,$line_length, $tag_length);
308 foreach my $aln (@aln) {
309 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
310 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
313 $self->throw("All sequences in the alignment must be the same length")
314 unless $aln->is_flush(1) ;
316 $flag_SI = $self->flag_SI();
317 $aln->set_displayname_flat(); # plain
318 $length = $aln->length();
320 if ($self->interleaved() ) {
321 $self->_print (sprintf(" %s %s I\n", $aln->num_sequences, $aln->length));
323 $self->_print (sprintf(" %s %s S\n", $aln->num_sequences, $aln->length));
326 $self->_print (sprintf(" %s %s\n", $aln->num_sequences, $aln->length));
329 $idlength = $self->idlength();
330 $line_length = $self->line_length();
331 $tag_length = $self->tag_length();
332 foreach $seq ( $aln->each_seq() ) {
333 $name = $aln->displayname($seq->get_nse);
335 $self->warn("The length of the name is over 50 chars long [$name]")
336 if length($name) > 50;
339 $name = substr($name, 0, $idlength) if length($name) > $idlength;
340 $name = sprintf("%-".$idlength."s",$name);
341 if( $self->interleaved() ) {
343 } elsif( $self->id_linebreak) {
347 #phylip needs dashes not dots
348 my $seq = $seq->seq();
354 if( $self->interleaved() ) {
356 if ($tag_length <= $line_length) {
357 $numtags = floor
($line_length/$tag_length);
358 $line_length = $tag_length*$numtags;
362 while( $count < $length ) {
364 # there is another block to go!
365 foreach $name ( @arr ) {
366 my $dispname = $name;
367 $dispname = '' if $wrapped;
368 $self->_print (sprintf("%".($idlength+3)."s",$dispname));
371 $self->debug("residue count: $count\n") if ($count%100000 == 0);
372 while( ($tempcount + $tag_length < $length) &&
373 ($index < $numtags) ) {
374 $self->_print (sprintf("%s ",substr($hash{$name},
377 $tempcount += $tag_length;
381 if( $index < $numtags) {
383 $self->_print (sprintf("%s",substr($hash{$name},
385 $tempcount += $tag_length;
387 $self->_print ("\n");
389 $self->_print ("\n");
394 foreach $name ( @arr ) {
395 my $dispname = $name;
396 my $line = sprintf("%s%s\n",$dispname,$hash{$name});
397 if( $self->wrap_sequential ) {
398 $line =~ s/(.{1,$width})/$1\n/g;
400 $self->_print ($line);
404 $self->flush if $self->_flush_on_write && defined $self->_fh;
411 Usage : my $interleaved = $obj->interleaved
412 Function: Get/Set Interleaved status
420 my ($self,$value) = @_;
421 if( defined $value ) {
422 if ($value) {$self->{'_interleaved'} = 1 }
423 else {$self->{'_interleaved'} = 0 }
425 return 1 unless defined $self->{'_interleaved'};
426 return $self->{'_interleaved'};
432 Usage : my $flag = $obj->flag_SI
433 Function: Get/Set if the Sequential/Interleaved flag has to be shown
434 after the number of sequences and sequence length
443 my ($self,$value) = @_;
444 my $previous = $self->{'_flag_SI'};
445 if( defined $value ) {
446 $self->{'_flag_SI'} = $value;
454 Usage : my $idlength = $obj->idlength
455 Function: Get/Set value of id length
463 my($self,$value) = @_;
465 $self->{'_idlength'} = $value;
467 return $self->{'_idlength'};
473 Usage : $obj->line_length($newval)
475 Returns : value of line_length
476 Args : newvalue (optional)
482 my ($self,$value) = @_;
483 if( defined $value) {
484 $self->{'_line_length'} = $value;
486 return $self->{'_line_length'} || $DEFAULTLINELEN;
493 Usage : $obj->tag_length($newval)
495 Example : my $tag_length = $obj->tag_length
496 Returns : value of the length for each space-separated tag in a line
497 Args : newvalue (optional) - set to zero to have one tag per line
503 my ($self,$value) = @_;
504 if( defined $value) {
505 $self->{'_tag_length'} = $value;
507 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
514 Usage : $obj->id_linebreak($newval)
516 Returns : value of id_linebreak
517 Args : newvalue (optional)
523 my ($self,$value) = @_;
524 if( defined $value) {
525 $self->{'_id_linebreak'} = $value;
527 return $self->{'_id_linebreak'} || 0;
531 =head2 wrap_sequential
533 Title : wrap_sequential
534 Usage : $obj->wrap_sequential($newval)
536 Returns : value of wrap_sequential
537 Args : newvalue (optional)
543 my ($self,$value) = @_;
544 if( defined $value) {
545 $self->{'_wrap_sequential'} = $value;
547 return $self->{'_wrap_sequential'} || 0;
553 Usage : $obj->longid($newval)
555 Returns : value of longid
556 Args : newvalue (optional)
562 my ($self,$value) = @_;
563 if( defined $value) {
564 $self->{'_longid'} = $value;
566 return $self->{'_longid'} || 0;