nexml.t: Added a missing XML::Twig requirement.
[bioperl-live.git] / Bio / AlignIO / phylip.pm
blob765538cb426421958f71549d7734f5bb73b26a7f
2 # BioPerl module for Bio::AlignIO::phylip
4 # Copyright Heikki Lehvaslaiho
7 =head1 NAME
9 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
11 =head1 SYNOPSIS
13 # Do not use this module directly. Use it via the Bio::AlignIO class.
15 use Bio::AlignIO;
16 use Bio::SimpleAlign;
17 #you can set the name length to something other than the default 10
18 #if you use a version of phylip (hacked) that accepts ids > 10
19 my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
20 -fh => \*STDOUT,
21 -idlength=>30);
22 # convert data from one format to another
23 my $gcgstream = Bio::AlignIO->new(-format => 'msf',
24 -file => 't/data/cysprot1a.msf');
26 while( my $aln = $gcgstream->next_aln ) {
27 $phylipstream->write_aln($aln);
30 # do it again with phylip sequential format format
31 $phylipstream->interleaved(0);
32 # can also initialize the object like this
33 $phylipstream = Bio::AlignIO->new(-interleaved => 0,
34 -format => 'phylip',
35 -fh => \*STDOUT,
36 -idlength=>10);
37 $gcgstream = Bio::AlignIO->new(-format => 'msf',
38 -file => 't/data/cysprot1a.msf');
40 while( my $aln = $gcgstream->next_aln ) {
41 $phylipstream->write_aln($aln);
44 =head1 DESCRIPTION
46 This object can transform Bio::SimpleAlign objects to and from PHYLIP
47 format. By default it works with the interleaved format. By specifying
48 the flag -interleaved =E<gt> 0 in the initialization the module can
49 read or write data in sequential format.
51 Long IDs up to 50 characters are supported by flag -longid =E<gt>
52 1. ID strings can be surrounded by single quoted. They are mandatory
53 only if the IDs contain spaces.
55 =head1 FEEDBACK
57 =head2 Support
59 Please direct usage questions or support issues to the mailing list:
61 I<bioperl-l@bioperl.org>
63 rather than to the module maintainer directly. Many experienced and
64 reponsive experts will be able look at the problem and quickly
65 address it. Please include a thorough description of the problem
66 with code and data examples if at all possible.
68 =head2 Reporting Bugs
70 Report bugs to the Bioperl bug tracking system to help us keep track
71 the bugs and their resolution. Bug reports can be submitted via the
72 web:
74 https://github.com/bioperl/bioperl-live/issues
76 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
78 Email: heikki at ebi.ac.uk
79 Email: jason at bioperl.org
81 =head1 APPENDIX
83 The rest of the documentation details each of the object
84 methods. Internal methods are usually preceded with a _
86 =cut
88 # Let the code begin...
90 package Bio::AlignIO::phylip;
91 use vars qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
92 use strict;
94 use Bio::SimpleAlign;
95 use POSIX; # for the rounding call
97 use base qw(Bio::AlignIO);
99 BEGIN {
100 $DEFAULTIDLENGTH = 10;
101 $DEFAULTLINELEN = 60;
102 $DEFAULTTAGLEN = 10;
105 =head2 new
107 Title : new
108 Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
109 -file => '>file',
110 -idlength => 10,
111 -idlinebreak => 1);
112 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
113 Returns : L<Bio::AlignIO> object
114 Args : [specific for writing of phylip format files]
115 -idlength => integer - length of the id (will pad w/
116 spaces if needed)
117 -interleaved => boolean - whether interleaved
118 or sequential format required
119 -line_length => integer of how long a sequence lines should be
120 -idlinebreak => insert a line break after the sequence id
121 so that sequence starts on the next line
122 -flag_SI => whether or not write a "S" or "I" just after
123 the num.seq. and line len., in the first line
124 -tag_length => integer of how long the tags have to be in
125 each line between the space separator. set it
126 to 0 to have 1 tag only.
127 -wrap_sequential => boolean for whether or not sequential
128 format should be broken up or a single line
129 default is false (single line)
130 -longid => boolean for allowing arbitrary long IDs (default is false)
132 =cut
134 sub _initialize {
135 my($self,@args) = @_;
136 $self->SUPER::_initialize(@args);
138 my ($interleave,$linelen,$idlinebreak,
139 $idlength, $flag_SI, $tag_length,$ws, $longid) =
140 $self->_rearrange([qw(INTERLEAVED
141 LINE_LENGTH
142 IDLINEBREAK
143 IDLENGTH
144 FLAG_SI
145 TAG_LENGTH
146 WRAP_SEQUENTIAL
147 LONGID)],@args);
148 $self->interleaved($interleave ? 1 : 0) if defined $interleave;
149 $self->idlength($idlength || $DEFAULTIDLENGTH);
150 $self->id_linebreak(1) if( $idlinebreak );
151 $self->line_length($linelen) if defined $linelen && $linelen > 0;
152 $self->flag_SI(1) if ( $flag_SI );
153 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
154 $self->wrap_sequential($ws ? 1 : 0);
155 $self->longid($longid ? 1 : 0);
159 =head2 next_aln
161 Title : next_aln
162 Usage : $aln = $stream->next_aln()
163 Function: returns the next alignment in the stream.
164 Throws an exception if trying to read in PHYLIP
165 sequential format.
166 Returns : L<Bio::SimpleAlign> object
167 Args :
169 =cut
171 sub next_aln {
172 my $self = shift;
173 my $entry;
174 my ($seqcount, $residuecount, %hash, $name,$str,
175 @names,$seqname,$start,$end,$count,$seq);
177 my $aln = Bio::SimpleAlign->new(-source => 'phylip');
179 # First, parse up through the header.
180 # If we see a non-blank line that isn't the seqcount and residuecount line
181 # then bail out of next_aln (return)
182 while ($entry = $self->_readline) {
183 if ($entry =~ /^\s?$/) {
184 next;
185 } elsif ($entry =~ /\s*(\d+)\s+(\d+)/) {
186 ($seqcount, $residuecount) = ($1, $2);
187 last;
188 } else {
189 $self->warn ("Failed to parse PHYLIP: Did not see a sequence count and residue count.");
190 return;
193 return unless $seqcount and $residuecount;
195 # First alignment section. We expect to see a name and (part of) a sequence.
196 my $idlen = $self->idlength;
197 $count = 0;
199 while ($entry = $self->_readline) {
200 if ($entry =~ /^\s?$/) { # eat the newlines
201 next;
204 # Names can be in a few different formats:
205 # 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
206 # 2. they can be hacked with a long ID, as passed in with the flag -longid.
207 # 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
208 if ($self->longid()) { # 2 or 3
209 if ($entry =~ /^'(.+)'\s+(.+)$/) { # 3. name has single quotes.
210 $name = $1;
211 $str = $2;
212 } else { # 2. name does not have single quotes, so should not have spaces.
213 # therefore, the first part of the line is the name and the rest is the seq.
214 # make sure that the line does not lead with extra spaces.
215 $entry =~ s/^\s+//;
216 ($name, $str) = split (/\s+/,$entry, 2);
218 } else { # 1. traditional phylip.
219 $entry =~ /^(.{10})\s+(.+)$/;
220 $name = $1;
221 $str = $2;
222 $name =~ s/\s+$//; # eat any trailing spaces
223 $name =~ s/\s+/_/g;
225 push @names, $name;
226 #clean sequence of spaces:
227 $str =~ s/\s+//g;
229 # are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
230 if (($self->interleaved) == 0) {
231 while (length($str) < $residuecount) {
232 $entry = $self->_readline;
233 $str .= $entry;
234 $str =~ s/\s+//g;
235 if ($entry =~ /^\s*$/) { # we ran into a newline before we got a complete sequence: bail!
236 $self->warn("Failed to parse PHYLIP: Sequence $name was shorter than expected: " . length($str) . " instead of $residuecount.");
237 last;
241 $hash{$count} = $str;
243 $count++;
244 # if we've read as many seqs as we're supposed to, move on.
245 if ($count == $seqcount) {
246 last;
250 # if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
251 if ($self->interleaved) {
252 while (length($hash{$seqcount-1}) < $residuecount) {
253 $count = 0;
254 while ($entry = $self->_readline) {
255 if ($entry =~ /^\s*$/) { # eat newlines
256 if ($count != 0) { # there was a newline at an unexpected place!
257 $self->warn("Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount.");
258 return;
260 next;
261 } else { # start taking in chunks
262 $entry =~ s/\s//g;
263 $hash{$count} .= $entry;
264 $count++;
266 if ($count >= $seqcount) { # we've read all of the sequences for this chunk, so move on.
267 last;
272 if ((scalar @names) != $seqcount) {
273 $self->warn("Failed to parse PHYLIP: Did not see the correct number of seqs: saw " . scalar(@names) . ", expected $seqcount.");
274 return;
276 for ($count=0; $count<$seqcount; $count++) {
277 $str = $hash{$count};
278 my $seqname = $names[$count];
279 if (length($str) != $residuecount) {
280 $self->warn("Failed to parse PHYLIP: Sequence $seqname was the wrong length: " . length($str) . " instead of $residuecount.");
282 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
283 '-display_id' => $seqname);
284 $aln->add_seq($seq);
286 return $aln;
289 =head2 write_aln
291 Title : write_aln
292 Usage : $stream->write_aln(@aln)
293 Function: writes the $aln object into the stream in phylip format
294 Returns : 1 for success and 0 for error
295 Args : L<Bio::Align::AlignI> object
297 =cut
299 sub write_aln {
300 my ($self,@aln) = @_;
301 my $count = 0;
302 my $wrapped = 0;
303 my $maxname;
304 my $width = $self->line_length();
305 my ($length,$date,$name,$seq,$miss,$pad,
306 %hash,@arr,$tempcount,$index,$idlength,$flag_SI,$line_length, $tag_length);
308 foreach my $aln (@aln) {
309 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
310 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
311 next;
313 $self->throw("All sequences in the alignment must be the same length")
314 unless $aln->is_flush(1) ;
316 $flag_SI = $self->flag_SI();
317 $aln->set_displayname_flat(); # plain
318 $length = $aln->length();
319 if ($flag_SI) {
320 if ($self->interleaved() ) {
321 $self->_print (sprintf(" %s %s I\n", $aln->num_sequences, $aln->length));
322 } else {
323 $self->_print (sprintf(" %s %s S\n", $aln->num_sequences, $aln->length));
325 } else {
326 $self->_print (sprintf(" %s %s\n", $aln->num_sequences, $aln->length));
329 $idlength = $self->idlength();
330 $line_length = $self->line_length();
331 $tag_length = $self->tag_length();
332 foreach $seq ( $aln->each_seq() ) {
333 $name = $aln->displayname($seq->get_nse);
334 if ($self->longid) {
335 $self->warn("The length of the name is over 50 chars long [$name]")
336 if length($name) > 50;
337 $name = "'$name' "
338 } else {
339 $name = substr($name, 0, $idlength) if length($name) > $idlength;
340 $name = sprintf("%-".$idlength."s",$name);
341 if( $self->interleaved() ) {
342 $name .= ' ' ;
343 } elsif( $self->id_linebreak) {
344 $name .= "\n";
347 #phylip needs dashes not dots
348 my $seq = $seq->seq();
349 $seq =~ s/\./-/g;
350 $hash{$name} = $seq;
351 push(@arr,$name);
354 if( $self->interleaved() ) {
355 my $numtags;
356 if ($tag_length <= $line_length) {
357 $numtags = floor($line_length/$tag_length);
358 $line_length = $tag_length*$numtags;
359 } else {
360 $numtags = 1;
362 while( $count < $length ) {
364 # there is another block to go!
365 foreach $name ( @arr ) {
366 my $dispname = $name;
367 $dispname = '' if $wrapped;
368 $self->_print (sprintf("%".($idlength+3)."s",$dispname));
369 $tempcount = $count;
370 $index = 0;
371 $self->debug("residue count: $count\n") if ($count%100000 == 0);
372 while( ($tempcount + $tag_length < $length) &&
373 ($index < $numtags) ) {
374 $self->_print (sprintf("%s ",substr($hash{$name},
375 $tempcount,
376 $tag_length)));
377 $tempcount += $tag_length;
378 $index++;
380 # last
381 if( $index < $numtags) {
382 # space to print!
383 $self->_print (sprintf("%s",substr($hash{$name},
384 $tempcount)));
385 $tempcount += $tag_length;
387 $self->_print ("\n");
389 $self->_print ("\n");
390 $count = $tempcount;
391 $wrapped = 1;
393 } else {
394 foreach $name ( @arr ) {
395 my $dispname = $name;
396 my $line = sprintf("%s%s\n",$dispname,$hash{$name});
397 if( $self->wrap_sequential ) {
398 $line =~ s/(.{1,$width})/$1\n/g;
400 $self->_print ($line);
404 $self->flush if $self->_flush_on_write && defined $self->_fh;
405 return 1;
408 =head2 interleaved
410 Title : interleaved
411 Usage : my $interleaved = $obj->interleaved
412 Function: Get/Set Interleaved status
413 Returns : boolean
414 Args : boolean
417 =cut
419 sub interleaved {
420 my ($self,$value) = @_;
421 if( defined $value ) {
422 if ($value) {$self->{'_interleaved'} = 1 }
423 else {$self->{'_interleaved'} = 0 }
425 return 1 unless defined $self->{'_interleaved'};
426 return $self->{'_interleaved'};
429 =head2 flag_SI
431 Title : flag_SI
432 Usage : my $flag = $obj->flag_SI
433 Function: Get/Set if the Sequential/Interleaved flag has to be shown
434 after the number of sequences and sequence length
435 Example :
436 Returns : boolean
437 Args : boolean
440 =cut
442 sub flag_SI{
443 my ($self,$value) = @_;
444 my $previous = $self->{'_flag_SI'};
445 if( defined $value ) {
446 $self->{'_flag_SI'} = $value;
448 return $previous;
451 =head2 idlength
453 Title : idlength
454 Usage : my $idlength = $obj->idlength
455 Function: Get/Set value of id length
456 Returns : string
457 Args : string
460 =cut
462 sub idlength {
463 my($self,$value) = @_;
464 if (defined $value){
465 $self->{'_idlength'} = $value;
467 return $self->{'_idlength'};
470 =head2 line_length
472 Title : line_length
473 Usage : $obj->line_length($newval)
474 Function:
475 Returns : value of line_length
476 Args : newvalue (optional)
479 =cut
481 sub line_length{
482 my ($self,$value) = @_;
483 if( defined $value) {
484 $self->{'_line_length'} = $value;
486 return $self->{'_line_length'} || $DEFAULTLINELEN;
490 =head2 tag_length
492 Title : tag_length
493 Usage : $obj->tag_length($newval)
494 Function:
495 Example : my $tag_length = $obj->tag_length
496 Returns : value of the length for each space-separated tag in a line
497 Args : newvalue (optional) - set to zero to have one tag per line
500 =cut
502 sub tag_length{
503 my ($self,$value) = @_;
504 if( defined $value) {
505 $self->{'_tag_length'} = $value;
507 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
511 =head2 id_linebreak
513 Title : id_linebreak
514 Usage : $obj->id_linebreak($newval)
515 Function:
516 Returns : value of id_linebreak
517 Args : newvalue (optional)
520 =cut
522 sub id_linebreak{
523 my ($self,$value) = @_;
524 if( defined $value) {
525 $self->{'_id_linebreak'} = $value;
527 return $self->{'_id_linebreak'} || 0;
531 =head2 wrap_sequential
533 Title : wrap_sequential
534 Usage : $obj->wrap_sequential($newval)
535 Function:
536 Returns : value of wrap_sequential
537 Args : newvalue (optional)
540 =cut
542 sub wrap_sequential{
543 my ($self,$value) = @_;
544 if( defined $value) {
545 $self->{'_wrap_sequential'} = $value;
547 return $self->{'_wrap_sequential'} || 0;
550 =head2 longid
552 Title : longid
553 Usage : $obj->longid($newval)
554 Function:
555 Returns : value of longid
556 Args : newvalue (optional)
559 =cut
561 sub longid{
562 my ($self,$value) = @_;
563 if( defined $value) {
564 $self->{'_longid'} = $value;
566 return $self->{'_longid'} || 0;