maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / AlignIO / phylip.pm
blobd694b006278679f152e8dce2ca0a26a8e5f4ffca
2 # BioPerl module for Bio::AlignIO::phylip
4 # Copyright Heikki Lehvaslaiho
7 =head1 NAME
9 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
11 =head1 SYNOPSIS
13 Do not use this module directly. Use it via the Bio::AlignIO class.
15 This example shows how to write to phylip format:
17 use Bio::AlignIO;
18 use Bio::SimpleAlign;
20 # Use -idlength to set the name length to something other than
21 # the default 10 if you need longer ids.
22 my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
23 -fh => \*STDOUT,
24 -idlength => 30);
25 # Convert data from one format to another
26 my $gcgstream = Bio::AlignIO->new(-format => 'msf',
27 -file => 't/data/cysprot1a.msf');
29 while( my $aln = $gcgstream->next_aln ) {
30 $phylipstream->write_aln($aln);
33 # With phylip sequential format format
34 $phylipstream->interleaved(0);
35 # Or initialize the object like this
36 $phylipstream = Bio::AlignIO->new(-interleaved => 0,
37 -format => 'phylip',
38 -fh => \*STDOUT,
39 -idlength => 20 );
40 $gcgstream = Bio::AlignIO->new(-format => 'msf',
41 -file => 't/data/cysprot1a.msf');
43 while( my $aln = $gcgstream->next_aln ) {
44 $phylipstream->write_aln($aln);
47 This example shows how to read phylip format:
49 my $in = Bio::AlignIO->new(
50 -file => $inFile,
51 -format => 'phylip',
52 -interleaved => 0,
53 -longid => 1
56 my $out = Bio::AlignIO->new(
57 -file => ">$outFile",
58 -format => 'fasta'
61 while ( my $aln = $in->next_aln() ) {
62 $out->write_aln($aln);
65 The -longid argument is required if the input phylip format file
66 has ids with lengths greater then 10 characters.
68 =head1 DESCRIPTION
70 This object can transform Bio::SimpleAlign objects to and from PHYLIP
71 format. By default it works with the interleaved format. By specifying
72 the flag -interleaved =E<gt> 0 in the initialization the module can
73 read or write data in sequential format.
75 Reading phylip format with long IDs up to 50 characters is supported by
76 the flag -longid =E<gt>1. ID strings can be surrounded by single quotes.
77 They are mandatory only if the IDs contain spaces.
79 =head1 FEEDBACK
81 =head2 Support
83 Please direct usage questions or support issues to the mailing list:
85 I<bioperl-l@bioperl.org>
87 rather than to the module maintainer directly. Many experienced and
88 reponsive experts will be able look at the problem and quickly
89 address it. Please include a thorough description of the problem
90 with code and data examples if at all possible.
92 =head2 Reporting Bugs
94 Report bugs to the Bioperl bug tracking system to help us keep track
95 the bugs and their resolution. Bug reports can be submitted via the
96 web:
98 https://github.com/bioperl/bioperl-live/issues
100 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
102 Email: heikki at ebi.ac.uk
103 Email: jason at bioperl.org
105 =head1 APPENDIX
107 The rest of the documentation details each of the object
108 methods. Internal methods are usually preceded with a _
110 =cut
112 # Let the code begin...
114 package Bio::AlignIO::phylip;
115 use vars qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
116 use strict;
118 use Bio::SimpleAlign;
119 use POSIX; # for the rounding call
121 use base qw(Bio::AlignIO);
123 BEGIN {
124 $DEFAULTIDLENGTH = 10;
125 $DEFAULTLINELEN = 60;
126 $DEFAULTTAGLEN = 10;
129 =head2 new
131 Title : new
132 Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
133 -file => '>file',
134 -idlength => 10,
135 -idlinebreak => 1);
136 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
137 Returns : L<Bio::AlignIO> object
138 Args : [specific for writing of phylip format files]
139 -idlength => integer - length of the id (will pad w/
140 spaces if needed) when writing phylip
141 -interleaved => boolean - whether interleaved
142 or sequential format required
143 -line_length => integer of how long a sequence lines should be
144 -idlinebreak => insert a line break after the sequence id
145 so that sequence starts on the next line
146 -flag_SI => whether or not write a "S" or "I" just after
147 the num.seq. and line len., in the first line
148 -tag_length => integer of how long the tags have to be in
149 each line between the space separator. set it
150 to 0 to have 1 tag only.
151 -wrap_sequential => boolean for whether or not sequential
152 format should be broken up or a single line
153 default is false (single line)
154 -longid => boolean to read arbitrary long IDs (default is false)
156 =cut
158 sub _initialize {
159 my ( $self, @args ) = @_;
160 $self->SUPER::_initialize(@args);
162 my ( $interleave, $linelen, $idlinebreak,
163 $idlength, $flag_SI, $tag_length, $ws, $longid )
164 = $self->_rearrange(
165 [ qw(INTERLEAVED
166 LINE_LENGTH
167 IDLINEBREAK
168 IDLENGTH
169 FLAG_SI
170 TAG_LENGTH
171 WRAP_SEQUENTIAL
172 LONGID)
174 @args
176 $self->interleaved( $interleave ? 1 : 0 ) if defined $interleave;
177 $self->idlength( $idlength || $DEFAULTIDLENGTH );
178 $self->id_linebreak(1) if ($idlinebreak);
179 $self->line_length($linelen) if defined $linelen && $linelen > 0;
180 $self->flag_SI(1) if ($flag_SI);
181 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
182 $self->wrap_sequential( $ws ? 1 : 0 );
183 $self->longid( $longid ? 1 : 0 );
187 =head2 next_aln
189 Title : next_aln
190 Usage : $aln = $stream->next_aln()
191 Function: returns the next alignment in the stream.
192 Throws an exception if trying to read in PHYLIP
193 sequential format.
194 Returns : L<Bio::SimpleAlign> object
195 Args :
197 =cut
199 sub next_aln {
200 my $self = shift;
201 my $entry;
202 my ($seqcount, $residuecount, %hash, $name,
203 $str, @names, $seqname, $start,
204 $end, $count, $seq
207 my $aln = Bio::SimpleAlign->new( -source => 'phylip' );
209 # First, parse up through the header.
210 # If we see a non-blank line that isn't the seqcount and residuecount line
211 # then bail out of next_aln (return)
212 while ( $entry = $self->_readline ) {
213 if ( $entry =~ /^\s?$/ ) {
214 next;
215 } elsif ( $entry =~ /\s*(\d+)\s+(\d+)/ ) {
216 ( $seqcount, $residuecount ) = ( $1, $2 );
217 last;
218 } else {
219 $self->warn(
220 "Failed to parse PHYLIP: Did not see a sequence count and residue count."
222 return;
225 return unless $seqcount and $residuecount;
227 # First alignment section. We expect to see a name and (part of) a sequence.
228 my $idlen = $self->idlength;
229 $count = 0;
231 while ( $entry = $self->_readline ) {
232 if ( $entry =~ /^\s?$/ ) { # eat the newlines
233 next;
236 # Names can be in a few different formats:
237 # 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
238 # 2. they can be hacked with a long ID, as passed in with the flag -longid.
239 # 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
240 if ( $self->longid() ) { # 2 or 3
241 if ( $entry =~ /^'(.+)'\s+(.+)$/ ) { # 3. name has single quotes.
242 $name = $1;
243 $str = $2;
244 } else { # 2. name does not have single quotes, so should not have spaces.
245 # therefore, the first part of the line is the name and the rest is the seq.
246 # make sure that the line does not lead with extra spaces.
247 $entry =~ s/^\s+//;
248 ( $name, $str ) = split( /\s+/, $entry, 2 );
250 } else { # 1. traditional phylip.
251 $entry =~ /^(.{1,10})\s(.+)$/;
252 $name = $1;
253 $str = $2;
254 $name =~ s/\s+$//; # eat any trailing spaces
255 $name =~ s/\s+/_/g;
257 push @names, $name;
259 #clean sequence of spaces:
260 $str =~ s/\s+//g;
262 # are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
263 if ( ( $self->interleaved ) == 0 ) {
264 while ( length($str) < $residuecount ) {
265 $entry = $self->_readline;
266 $str .= $entry;
267 $str =~ s/\s+//g;
268 if ( $entry =~ /^\s*$/ ) { # we ran into a newline before we got a complete sequence: bail!
269 $self->warn(
270 "Failed to parse PHYLIP: Sequence $name was shorter than expected: "
271 . length($str)
272 . " instead of $residuecount." );
273 last;
277 $hash{$count} = $str;
279 $count++;
281 # if we've read as many seqs as we're supposed to, move on.
282 if ( $count == $seqcount ) {
283 last;
287 # if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
288 if ( $self->interleaved ) {
289 while ( length( $hash{ $seqcount - 1 } ) < $residuecount ) {
290 $count = 0;
291 while ( $entry = $self->_readline ) {
292 if ( $entry =~ /^\s*$/ ) { # eat newlines
293 if ( $count != 0 ) { # there was a newline at an unexpected place!
294 $self->warn(
295 "Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount."
297 return;
299 next;
300 } else { # start taking in chunks
301 $entry =~ s/\s//g;
302 $hash{$count} .= $entry;
303 $count++;
305 if ( $count >= $seqcount ) { # we've read all of the sequences for this chunk, so move on.
306 last;
311 if ( ( scalar @names ) != $seqcount ) {
312 $self->warn(
313 "Failed to parse PHYLIP: Did not see the correct number of seqs: saw "
314 . scalar(@names)
315 . ", expected $seqcount." );
316 return;
318 for ( $count = 0; $count < $seqcount; $count++ ) {
319 $str = $hash{$count};
320 my $seqname = $names[$count];
321 if ( length($str) != $residuecount ) {
322 $self->warn(
323 "Failed to parse PHYLIP: Sequence $seqname was the wrong length: "
324 . length($str)
325 . " instead of $residuecount." );
327 $seq = Bio::LocatableSeq->new(
328 '-seq' => $hash{$count},
329 '-display_id' => $seqname
331 $aln->add_seq($seq);
333 return $aln;
336 =head2 write_aln
338 Title : write_aln
339 Usage : $stream->write_aln(@aln)
340 Function: writes the $aln object into the stream in phylip format
341 Returns : 1 for success and 0 for error
342 Args : L<Bio::Align::AlignI> object
344 =cut
346 sub write_aln {
347 my ( $self, @aln ) = @_;
348 my $count = 0;
349 my $wrapped = 0;
350 my $maxname;
351 my $width = $self->line_length();
352 my ($length, $date, $name, $seq, $miss,
353 $pad, %hash, @arr, $tempcount, $index,
354 $idlength, $flag_SI, $line_length, $tag_length
357 foreach my $aln (@aln) {
358 if ( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
359 $self->warn(
360 "Must provide a Bio::Align::AlignI object when calling write_aln"
362 next;
364 $self->throw("All sequences in the alignment must be the same length")
365 unless $aln->is_flush(1);
367 $flag_SI = $self->flag_SI();
368 $aln->set_displayname_flat(); # plain
369 $length = $aln->length();
370 if ($flag_SI) {
371 if ( $self->interleaved() ) {
372 $self->_print(
373 sprintf(
374 " %s %s I\n", $aln->num_sequences, $aln->length
377 } else {
378 $self->_print(
379 sprintf(
380 " %s %s S\n", $aln->num_sequences, $aln->length
384 } else {
385 $self->_print(
386 sprintf( " %s %s\n", $aln->num_sequences, $aln->length ) );
389 $idlength = $self->idlength();
390 $line_length = $self->line_length();
391 $tag_length = $self->tag_length();
392 foreach $seq ( $aln->each_seq() ) {
393 $name = $aln->displayname( $seq->get_nse );
394 if ( $self->longid ) {
395 $self->warn(
396 "The length of the name is over 50 chars long [$name]")
397 if length($name) > 50;
398 $name = "'$name' ";
399 } else {
400 $name = substr( $name, 0, $idlength )
401 if length($name) > $idlength;
402 $name = sprintf( "%-" . $idlength . "s", $name );
403 if ( $self->interleaved() ) {
404 $name .= ' ';
405 } elsif ( $self->id_linebreak ) {
406 $name .= "\n";
410 #phylip needs dashes not dots
411 my $seq = $seq->seq();
412 $seq =~ s/\./-/g;
413 $hash{$name} = $seq;
414 push( @arr, $name );
417 if ( $self->interleaved() ) {
418 my $numtags;
419 if ( $tag_length <= $line_length ) {
420 $numtags = floor( $line_length / $tag_length );
421 $line_length = $tag_length * $numtags;
422 } else {
423 $numtags = 1;
425 while ( $count < $length ) {
427 # there is another block to go!
428 foreach $name (@arr) {
429 my $dispname = $name;
430 $dispname = '' if $wrapped;
431 $self->_print(
432 sprintf( "%" . ( $idlength + 3 ) . "s", $dispname ) );
433 $tempcount = $count;
434 $index = 0;
435 $self->debug("residue count: $count\n")
436 if ( $count % 100000 == 0 );
437 while (( $tempcount + $tag_length < $length )
438 && ( $index < $numtags ) ) {
439 $self->_print(
440 sprintf(
441 "%s ",
442 substr(
443 $hash{$name}, $tempcount, $tag_length
447 $tempcount += $tag_length;
448 $index++;
451 # last
452 if ( $index < $numtags ) {
454 # space to print!
455 $self->_print(
456 sprintf( "%s",
457 substr( $hash{$name}, $tempcount ) )
459 $tempcount += $tag_length;
461 $self->_print("\n");
463 $self->_print("\n");
464 $count = $tempcount;
465 $wrapped = 1;
467 } else {
468 foreach $name (@arr) {
469 my $dispname = $name;
470 my $line = sprintf( "%s%s\n", $dispname, $hash{$name} );
471 if ( $self->wrap_sequential ) {
472 $line =~ s/(.{1,$width})/$1\n/g;
474 $self->_print($line);
478 $self->flush if $self->_flush_on_write && defined $self->_fh;
479 return 1;
482 =head2 interleaved
484 Title : interleaved
485 Usage : my $interleaved = $obj->interleaved
486 Function: Get/Set Interleaved status
487 Returns : boolean
488 Args : boolean
491 =cut
493 sub interleaved {
494 my ( $self, $value ) = @_;
495 if ( defined $value ) {
496 if ($value) {
497 $self->{'_interleaved'} = 1
498 } else {
499 $self->{'_interleaved'} = 0
502 return 1 unless defined $self->{'_interleaved'};
503 return $self->{'_interleaved'};
506 =head2 flag_SI
508 Title : flag_SI
509 Usage : my $flag = $obj->flag_SI
510 Function: Get/Set if the Sequential/Interleaved flag has to be shown
511 after the number of sequences and sequence length
512 Example :
513 Returns : boolean
514 Args : boolean
517 =cut
519 sub flag_SI {
520 my ( $self, $value ) = @_;
521 my $previous = $self->{'_flag_SI'};
522 if ( defined $value ) {
523 $self->{'_flag_SI'} = $value;
525 return $previous;
528 =head2 idlength
530 Title : idlength
531 Usage : my $idlength = $obj->idlength
532 Function: Get/Set value of id length
533 Returns : string
534 Args : string
537 =cut
539 sub idlength {
540 my ( $self, $value ) = @_;
541 if ( defined $value ) {
542 $self->{'_idlength'} = $value;
544 return $self->{'_idlength'};
547 =head2 line_length
549 Title : line_length
550 Usage : $obj->line_length($newval)
551 Function:
552 Returns : value of line_length
553 Args : newvalue (optional)
556 =cut
558 sub line_length {
559 my ( $self, $value ) = @_;
560 if ( defined $value ) {
561 $self->{'_line_length'} = $value;
563 return $self->{'_line_length'} || $DEFAULTLINELEN;
567 =head2 tag_length
569 Title : tag_length
570 Usage : $obj->tag_length($newval)
571 Function:
572 Example : my $tag_length = $obj->tag_length
573 Returns : value of the length for each space-separated tag in a line
574 Args : newvalue (optional) - set to zero to have one tag per line
577 =cut
579 sub tag_length {
580 my ( $self, $value ) = @_;
581 if ( defined $value ) {
582 $self->{'_tag_length'} = $value;
584 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
587 =head2 id_linebreak
589 Title : id_linebreak
590 Usage : $obj->id_linebreak($newval)
591 Function:
592 Returns : value of id_linebreak
593 Args : newvalue (optional)
596 =cut
598 sub id_linebreak {
599 my ( $self, $value ) = @_;
600 if ( defined $value ) {
601 $self->{'_id_linebreak'} = $value;
603 return $self->{'_id_linebreak'} || 0;
606 =head2 wrap_sequential
608 Title : wrap_sequential
609 Usage : $obj->wrap_sequential($newval)
610 Function:
611 Returns : value of wrap_sequential
612 Args : newvalue (optional)
615 =cut
617 sub wrap_sequential {
618 my ( $self, $value ) = @_;
619 if ( defined $value ) {
620 $self->{'_wrap_sequential'} = $value;
622 return $self->{'_wrap_sequential'} || 0;
625 =head2 longid
627 Title : longid
628 Usage : $obj->longid($newval)
629 Function:
630 Returns : value of longid
631 Args : newvalue (optional)
634 =cut
636 sub longid {
637 my ( $self, $value ) = @_;
638 if ( defined $value ) {
639 $self->{'_longid'} = $value;
641 return $self->{'_longid'} || 0;