reimplement various methods in terms of get_dbxrefs, for consistency
[bioperl-live.git] / Bio / AlignIO / phylip.pm
blob7228fa45108412e58a25eface9c6db87a5b815ea
1 # $Id$
3 # BioPerl module for Bio::AlignIO::phylip
5 # Copyright Heikki Lehvaslaiho
8 =head1 NAME
10 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
12 =head1 SYNOPSIS
14 # Do not use this module directly. Use it via the Bio::AlignIO class.
16 use Bio::AlignIO;
17 use Bio::SimpleAlign;
18 #you can set the name length to something other than the default 10
19 #if you use a version of phylip (hacked) that accepts ids > 10
20 my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
21 -fh => \*STDOUT,
22 -idlength=>30);
23 # convert data from one format to another
24 my $gcgstream = Bio::AlignIO->new(-format => 'msf',
25 -file => 't/data/cysprot1a.msf');
27 while( my $aln = $gcgstream->next_aln ) {
28 $phylipstream->write_aln($aln);
31 # do it again with phylip sequential format format
32 $phylipstream->interleaved(0);
33 # can also initialize the object like this
34 $phylipstream = Bio::AlignIO->new(-interleaved => 0,
35 -format => 'phylip',
36 -fh => \*STDOUT,
37 -idlength=>10);
38 $gcgstream = Bio::AlignIO->new(-format => 'msf',
39 -file => 't/data/cysprot1a.msf');
41 while( my $aln = $gcgstream->next_aln ) {
42 $phylipstream->write_aln($aln);
45 =head1 DESCRIPTION
47 This object can transform Bio::SimpleAlign objects to and from PHYLIP
48 fotmat. By deafult it works with the interleaved format. By specifying
49 the flag -interleaved =E<gt> 0 in the initialization the module can
50 read or write data in sequential format.
52 Long IDs up to 50 characters are supported by flag -longid =E<gt>
53 1. ID strings can be surrounded by single quoted. They are mandatory
54 only if the IDs contain spaces.
56 =head1 FEEDBACK
58 =head2 Support
60 Please direct usage questions or support issues to the mailing list:
62 I<bioperl-l@bioperl.org>
64 rather than to the module maintainer directly. Many experienced and
65 reponsive experts will be able look at the problem and quickly
66 address it. Please include a thorough description of the problem
67 with code and data examples if at all possible.
69 =head2 Reporting Bugs
71 Report bugs to the Bioperl bug tracking system to help us keep track
72 the bugs and their resolution. Bug reports can be submitted via the
73 web:
75 http://bugzilla.open-bio.org/
77 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
79 Email: heikki at ebi.ac.uk
80 Email: jason at bioperl.org
82 =head1 APPENDIX
84 The rest of the documentation details each of the object
85 methods. Internal methods are usually preceded with a _
87 =cut
89 # Let the code begin...
91 package Bio::AlignIO::phylip;
92 use vars qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
93 use strict;
95 use Bio::SimpleAlign;
96 use POSIX; # for the rounding call
98 use base qw(Bio::AlignIO);
100 BEGIN {
101 $DEFAULTIDLENGTH = 10;
102 $DEFAULTLINELEN = 60;
103 $DEFAULTTAGLEN = 10;
106 =head2 new
108 Title : new
109 Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
110 -file => '>file',
111 -idlength => 10,
112 -idlinebreak => 1);
113 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
114 Returns : L<Bio::AlignIO> object
115 Args : [specific for writing of phylip format files]
116 -idlength => integer - length of the id (will pad w/
117 spaces if needed)
118 -interleaved => boolean - whether interleaved
119 or sequential format required
120 -line_length => integer of how long a sequence lines should be
121 -idlinebreak => insert a line break after the sequence id
122 so that sequence starts on the next line
123 -flag_SI => whether or not write a "S" or "I" just after
124 the num.seq. and line len., in the first line
125 -tag_length => integer of how long the tags have to be in
126 each line between the space separator. set it
127 to 0 to have 1 tag only.
128 -wrap_sequential => boolean for whether or not sequential
129 format should be broken up or a single line
130 default is false (single line)
131 -longid => boolean for allowing arbitrary long IDs (default is false)
133 =cut
135 sub _initialize {
136 my($self,@args) = @_;
137 $self->SUPER::_initialize(@args);
139 my ($interleave,$linelen,$idlinebreak,
140 $idlength, $flag_SI, $tag_length,$ws, $longid) =
141 $self->_rearrange([qw(INTERLEAVED
142 LINE_LENGTH
143 IDLINEBREAK
144 IDLENGTH
145 FLAG_SI
146 TAG_LENGTH
147 WRAP_SEQUENTIAL
148 LONGID)],@args);
149 $self->interleaved($interleave ? 1 : 0) if defined $interleave;
150 $self->idlength($idlength || $DEFAULTIDLENGTH);
151 $self->id_linebreak(1) if( $idlinebreak );
152 $self->line_length($linelen) if defined $linelen && $linelen > 0;
153 $self->flag_SI(1) if ( $flag_SI );
154 $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN );
155 $self->wrap_sequential($ws ? 1 : 0);
156 $self->longid($longid ? 1 : 0);
160 =head2 next_aln
162 Title : next_aln
163 Usage : $aln = $stream->next_aln()
164 Function: returns the next alignment in the stream.
165 Throws an exception if trying to read in PHYLIP
166 sequential format.
167 Returns : L<Bio::SimpleAlign> object
168 Args :
170 =cut
172 sub next_aln {
173 my $self = shift;
174 my $entry;
175 my ($seqcount, $residuecount, %hash, $name,$str,
176 @names,$seqname,$start,$end,$count,$seq);
178 my $aln = Bio::SimpleAlign->new(-source => 'phylip');
180 # skip blank lines until we see header line
181 # if we see a non-blank line that isn't the seqcount and residuecount line
182 # then bail out of next_aln (return)
183 HEADER: while ($entry = $self->_readline) {
184 next if $entry =~ /^\s?$/;
185 if ($entry =~ /\s*(\d+)\s+(\d+)/) {
186 ($seqcount, $residuecount) = ($1, $2);
189 last HEADER;
191 return unless $seqcount and $residuecount;
193 # first alignment section
194 my $idlen = $self->idlength;
195 $count = 0;
196 my $iter = 1;
197 my $interleaved = $self->interleaved;
198 while( $entry = $self->_readline) {
199 last if( $entry =~ /^\s?$/ && $interleaved );
201 # we've hit the next entry.
202 if( $entry =~ /^\s+(\d+)\s+(\d+)\s*$/) {
203 $self->_pushback($entry);
204 last;
206 if( $self->longid && $entry =~ /\w/ ) {
207 if ($entry =~ /'/) {
208 $entry =~ /^\s*'([^']+)'\s+(.+)$/;
209 $name = $1;
210 $str = $2;
211 } else {
212 $entry =~ /^\s*([^\s]+)\s+(.+)$/;
213 $name = $1;
214 $str = $2;
216 # $name =~ s/[\s\/]/_/g; # not sure how wise is it to do this
217 $name =~ s/_+$//; # remove any trailing _'s
219 push @names, $name;
220 $str =~ s/\s//g;
221 $count = scalar @names;
222 $hash{$count} = $str;
224 } elsif( $entry =~ /^\s+(.+)$/ ) {
225 $interleaved = 0;
226 $str = $1;
227 $str =~ s/\s//g;
228 $count = scalar @names;
229 $hash{$count} .= $str;
230 } elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ||
231 $entry =~ /^(.{$idlen})(\S{$idlen}\s+.+)\s$/ # Handle weirdnes s when id is too long
233 $name = $1;
234 $str = $2;
235 $name =~ s/[\s\/]/_/g;
236 $name =~ s/_+$//; # remove any trailing _'s
238 push @names, $name;
239 $str =~ s/\s//g;
240 $count = scalar @names;
241 $hash{$count} = $str;
242 } elsif( $interleaved ) {
243 if( $entry =~ /^(\S+)\s+(.+)/ ||
244 $entry =~ /^(.{$idlen})(.*)\s$/ ) {
245 $name = $1;
246 $str = $2;
247 $name =~ s/[\s\/]/_/g;
248 $name =~ s/_+$//; # remove any trailing _'s
249 push @names, $name;
250 $str =~ s/\s//g;
251 $count = scalar @names;
252 $hash{$count} = $str;
253 } else {
254 $self->debug("unmatched line: $entry");
257 $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount;
260 if( $interleaved ) {
261 # interleaved sections
262 $count = 0;
263 while( $entry = $self->_readline) {
264 # finish current entry
265 if($entry =~/\s*\d+\s+\d+/){
266 $self->_pushback($entry);
267 last;
269 $count = 0, next if $entry =~ /^\s$/;
270 $entry =~ /\s*(.*)$/ && do {
271 $str = $1;
272 $str =~ s/\s//g;
273 $count++;
274 $hash{$count} .= $str;
276 $self->throw("Not a valid interleaved PHYLIP file! [$count,$seqcount] ($entry)") if $count > $seqcount;
279 return if scalar @names < 1;
281 # sequence creation
282 $count = 0;
283 foreach $name ( @names ) {
284 $count++;
285 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
286 $seqname = $1;
287 $start = $2;
288 $end = $3;
289 } else {
290 $seqname=$name;
291 $start = 1;
292 $str = $hash{$count};
293 $str =~ s/[^A-Za-z]//g;
294 $end = length($str);
296 # consistency test
297 $self->throw("Length of sequence [$seqname] is not [$residuecount] it is ".CORE::length($hash{$count})."! ")
298 unless CORE::length($hash{$count}) == $residuecount;
300 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
301 '-display_id' => $seqname,
302 '-start' => $start,
303 '-end' => $end,
304 '-alphabet' => $self->alphabet,
306 $aln->add_seq($seq);
309 return $aln if $aln->num_sequences;
310 return;
314 =head2 write_aln
316 Title : write_aln
317 Usage : $stream->write_aln(@aln)
318 Function: writes the $aln object into the stream in phylip format
319 Returns : 1 for success and 0 for error
320 Args : L<Bio::Align::AlignI> object
322 =cut
324 sub write_aln {
325 my ($self,@aln) = @_;
326 my $count = 0;
327 my $wrapped = 0;
328 my $maxname;
329 my $width = $self->line_length();
330 my ($length,$date,$name,$seq,$miss,$pad,
331 %hash,@arr,$tempcount,$index,$idlength,$flag_SI,$line_length, $tag_length);
333 foreach my $aln (@aln) {
334 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
335 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
336 next;
338 $self->throw("All sequences in the alignment must be the same length")
339 unless $aln->is_flush(1) ;
341 $flag_SI = $self->flag_SI();
342 $aln->set_displayname_flat(); # plain
343 $length = $aln->length();
344 if ($flag_SI) {
345 if ($self->interleaved() ) {
346 $self->_print (sprintf(" %s %s I\n", $aln->num_sequences, $aln->length));
347 } else {
348 $self->_print (sprintf(" %s %s S\n", $aln->num_sequences, $aln->length));
350 } else {
351 $self->_print (sprintf(" %s %s\n", $aln->num_sequences, $aln->length));
354 $idlength = $self->idlength();
355 $line_length = $self->line_length();
356 $tag_length = $self->tag_length();
357 foreach $seq ( $aln->each_seq() ) {
358 $name = $aln->displayname($seq->get_nse);
359 if ($self->longid) {
360 $self->warn("The lenght of the name is over 50 chars long [$name]")
361 if length($name) > 50;
362 $name = "'$name' "
363 } else {
364 $name = substr($name, 0, $idlength) if length($name) > $idlength;
365 $name = sprintf("%-".$idlength."s",$name);
366 if( $self->interleaved() ) {
367 $name .= ' ' ;
368 } elsif( $self->id_linebreak) {
369 $name .= "\n";
372 #phylip needs dashes not dots
373 my $seq = $seq->seq();
374 $seq =~ s/\./-/g;
375 $hash{$name} = $seq;
376 push(@arr,$name);
379 if( $self->interleaved() ) {
380 my $numtags;
381 if ($tag_length <= $line_length) {
382 $numtags = floor($line_length/$tag_length);
383 $line_length = $tag_length*$numtags;
384 } else {
385 $numtags = 1;
387 while( $count < $length ) {
389 # there is another block to go!
390 foreach $name ( @arr ) {
391 my $dispname = $name;
392 $dispname = '' if $wrapped;
393 $self->_print (sprintf("%".($idlength+3)."s",$dispname));
394 $tempcount = $count;
395 $index = 0;
396 $self->debug("residue count: $count\n") if ($count%100000 == 0);
397 while( ($tempcount + $tag_length < $length) &&
398 ($index < $numtags) ) {
399 $self->_print (sprintf("%s ",substr($hash{$name},
400 $tempcount,
401 $tag_length)));
402 $tempcount += $tag_length;
403 $index++;
405 # last
406 if( $index < $numtags) {
407 # space to print!
408 $self->_print (sprintf("%s ",substr($hash{$name},
409 $tempcount)));
410 $tempcount += $tag_length;
412 $self->_print ("\n");
414 $self->_print ("\n");
415 $count = $tempcount;
416 $wrapped = 1;
418 } else {
419 foreach $name ( @arr ) {
420 my $dispname = $name;
421 my $line = sprintf("%s%s\n",$dispname,$hash{$name});
422 if( $self->wrap_sequential ) {
423 $line =~ s/(.{1,$width})/$1\n/g;
425 $self->_print ($line);
429 $self->flush if $self->_flush_on_write && defined $self->_fh;
430 return 1;
433 =head2 interleaved
435 Title : interleaved
436 Usage : my $interleaved = $obj->interleaved
437 Function: Get/Set Interleaved status
438 Returns : boolean
439 Args : boolean
442 =cut
444 sub interleaved {
445 my ($self,$value) = @_;
446 if( defined $value ) {
447 if ($value) {$self->{'_interleaved'} = 1 }
448 else {$self->{'_interleaved'} = 0 }
450 return 1 unless defined $self->{'_interleaved'};
451 return $self->{'_interleaved'};
454 =head2 flag_SI
456 Title : flag_SI
457 Usage : my $flag = $obj->flag_SI
458 Function: Get/Set if the Sequential/Interleaved flag has to be shown
459 after the number of sequences and sequence length
460 Example :
461 Returns : boolean
462 Args : boolean
465 =cut
467 sub flag_SI{
468 my ($self,$value) = @_;
469 my $previous = $self->{'_flag_SI'};
470 if( defined $value ) {
471 $self->{'_flag_SI'} = $value;
473 return $previous;
476 =head2 idlength
478 Title : idlength
479 Usage : my $idlength = $obj->idlength
480 Function: Get/Set value of id length
481 Returns : string
482 Args : string
485 =cut
487 sub idlength {
488 my($self,$value) = @_;
489 if (defined $value){
490 $self->{'_idlength'} = $value;
492 return $self->{'_idlength'};
495 =head2 line_length
497 Title : line_length
498 Usage : $obj->line_length($newval)
499 Function:
500 Returns : value of line_length
501 Args : newvalue (optional)
504 =cut
506 sub line_length{
507 my ($self,$value) = @_;
508 if( defined $value) {
509 $self->{'_line_length'} = $value;
511 return $self->{'_line_length'} || $DEFAULTLINELEN;
515 =head2 tag_length
517 Title : tag_length
518 Usage : $obj->tag_length($newval)
519 Function:
520 Example : my $tag_length = $obj->tag_length
521 Returns : value of the length for each space-separated tag in a line
522 Args : newvalue (optional) - set to zero to have one tag per line
525 =cut
527 sub tag_length{
528 my ($self,$value) = @_;
529 if( defined $value) {
530 $self->{'_tag_length'} = $value;
532 return $self->{'_tag_length'} || $DEFAULTTAGLEN;
536 =head2 id_linebreak
538 Title : id_linebreak
539 Usage : $obj->id_linebreak($newval)
540 Function:
541 Returns : value of id_linebreak
542 Args : newvalue (optional)
545 =cut
547 sub id_linebreak{
548 my ($self,$value) = @_;
549 if( defined $value) {
550 $self->{'_id_linebreak'} = $value;
552 return $self->{'_id_linebreak'} || 0;
556 =head2 wrap_sequential
558 Title : wrap_sequential
559 Usage : $obj->wrap_sequential($newval)
560 Function:
561 Returns : value of wrap_sequential
562 Args : newvalue (optional)
565 =cut
567 sub wrap_sequential{
568 my ($self,$value) = @_;
569 if( defined $value) {
570 $self->{'_wrap_sequential'} = $value;
572 return $self->{'_wrap_sequential'} || 0;
575 =head2 longid
577 Title : longid
578 Usage : $obj->longid($newval)
579 Function:
580 Returns : value of longid
581 Args : newvalue (optional)
584 =cut
586 sub longid{
587 my ($self,$value) = @_;
588 if( defined $value) {
589 $self->{'_longid'} = $value;
591 return $self->{'_longid'} || 0;