fix filehandle issue for NEXUS parsing
[bioperl-live.git] / Bio / AlignIO / nexus.pm
blob77df679d9214929bc43d69cfbe3f9ed1da8cfa0b
1 # $Id$
3 # BioPerl module for Bio::AlignIO::nexus
5 # Copyright Heikki Lehvaslaiho
8 =head1 NAME
10 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
12 =head1 SYNOPSIS
14 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
16 use Bio::AlignIO;
18 my $in = new Bio::AlignIO(-format => 'nexus',
19 -file => 'aln.nexus');
20 while( my $aln = $in->next_aln ) {
21 # do something with the alignment
24 =head1 DESCRIPTION
26 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
27 data blocks. See method documentation for supported NEXUS features.
29 =head1 ACKNOWLEDGEMENTS
31 Will Fisher has written an excellent standalone NEXUS format parser in
32 Perl, readnexus. A number of tricks were adapted from it.
34 =head1 FEEDBACK
36 =head2 Reporting Bugs
38 Report bugs to the Bioperl bug tracking system to help us keep track
39 the bugs and their resolution. Bug reports can be submitted via the
40 web:
42 http://bugzilla.open-bio.org/
44 =head1 AUTHORS - Heikki Lehvaslaiho
46 Email: heikki-at-bioperl-dot-org
48 =head1 APPENDIX
50 The rest of the documentation details each of the object
51 methods. Internal methods are usually preceded with a _
53 =cut
55 # Let the code begin...
57 package Bio::AlignIO::nexus;
58 use vars qw(%valid_type);
59 use strict;
60 no strict "refs";
63 use base qw(Bio::AlignIO);
65 BEGIN {
66 %valid_type = map {$_, 1} qw( dna rna protein standard );
67 # standard throws error: inherited from Bio::PrimarySeq
70 =head2 new
72 Title : new
73 Usage : $alignio = new Bio::AlignIO(-format => 'nexus', -file => 'filename');
74 Function: returns a new Bio::AlignIO object to handle clustalw files
75 Returns : Bio::AlignIO::clustalw object
76 Args : -verbose => verbosity setting (-1,0,1,2)
77 -file => name of file to read in or with ">" - writeout
78 -fh => alternative to -file param - provide a filehandle
79 to read from/write to
80 -format => type of Alignment Format to process or produce
82 Customization of nexus flavor output
84 -show_symbols => print the symbols="ATGC" in the data definition
85 (MrBayes does not like this)
86 boolean [default is 1]
87 -show_endblock => print an 'endblock;' at the end of the data
88 (MyBayes does not like this)
89 boolean [default is 1]
91 =cut
93 sub _initialize {
94 my ($self, @args) = @_;
95 $self->SUPER::_initialize(@args);
96 my ($show_symbols, $endblock) =
97 $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
98 my @names = qw(symbols endblock);
99 for my $v ( $show_symbols, $endblock ) {
100 $v = 1 unless defined $v; # default value is 1
101 my $n = shift @names;
102 $self->flag($n, $v);
107 =head2 next_aln
109 Title : next_aln
110 Usage : $aln = $stream->next_aln()
111 Function: Returns the next alignment in the stream.
113 Supports the following NEXUS format features:
114 - The file has to start with '#NEXUS'
115 - Reads in the name of the alignment from a comment
116 (anything after 'TITLE: ') .
117 - Sequence names can be given in a taxa block, too.
118 - If matchchar notation is used, converts
119 them back to sequence characters.
120 - Does character conversions specified in the
121 NEXUS equate command.
122 - Sequence names of type 'Homo sapiens' and
123 Homo_sapiens are treated identically.
125 Returns : L<Bio::Align::AlignI> object
126 Args :
129 =cut
132 sub next_aln {
133 my $self = shift;
134 my $entry;
135 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
136 $match, $gap, $missing, $equate, $interleave,
137 $name,$str,@names,$seqname,$start,$end,$count,$seq);
139 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
141 # file starts with '#NEXUS' but we allow white space only lines before it
142 $entry = $self->_readline;
143 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
145 return unless $entry;
146 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
147 unless ($entry && $entry =~ /^#NEXUS/i);
149 # skip anything before either the taxa or data block
150 # but read in the optional title in a comment
151 while (defined($entry = $self->_readline)) {
152 local ($_) = $entry;
153 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
154 last if /^begin +data/i || /^begin +taxa/i;
156 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
158 # data and taxa blocks
159 my $incomment;
160 while (defined ($entry = $self->_readline)) {
161 local ($_) = $entry;
162 next if s/\[[^\]]+\]//g; # remove comments
163 if( s/\[[^\]]+$// ) {
164 $incomment = 1;
165 # skip line if it is now empty or contains only whitespace
166 next if /^\s*$/;
167 } elsif($incomment) {
168 if( s/^[^\]]*\]// ) {
169 $incomment = 0;
170 } else {
171 next;
173 } elsif( /taxlabels/i ) {
174 # doesn't deal with taxlabels adequately and can mess things up!
175 # @names = $self->_read_taxlabels;
176 } else {
178 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
179 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
180 /matchchar\s*=\s*(.)/i and $match = $1;
181 /gap\s*=\s*(.)/i and $gap = $1;
182 /missing\s*=\s*(.)/i and $missing = $1;
183 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
184 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
185 /interleave/i and $interleave = 1 ;
186 last if /matrix/io;
189 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
190 unless $alphabet;
191 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
192 unless $valid_type{$alphabet};
193 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
194 if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
195 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
196 if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
198 $aln->gap_char($gap);
199 $aln->missing_char($missing);
202 # if data is not right after the matrix line
203 # read the empty lines out
205 while ($entry = $self->_readline) {
206 unless ($entry =~ /^\s+$/) {
207 $self->_pushback($entry);
208 last;
213 # matrix command
215 # first alignment section
216 if (@names == 0) { # taxa block did not exist
217 while ($entry = $self->_readline) {
218 local ($_) = $entry;
219 if( s/\[[^\]]+\]//g ) { #] remove comments
220 next if /^\s*$/;
221 # skip line if it is now empty or contains only whitespace
223 if ($interleave && defined$count && ($count <= $seqcount)) {
224 /^\s+$/ and last;
225 } else {
226 /^\s+$/ and next;
228 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
229 #/^\s*;\s*$/ and last;
230 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
231 # get single and double quoted names, or all the first
232 # nonwhite word as the name, and remained is seq
233 #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
234 $name = ($2 || $3);
235 if ($4) {
236 # seq is on same line as name
237 # this is the usual NEXUS format
238 $str = $4;
239 } else {
240 # otherwise get seq from following lines. No comments allowed
241 # a less common matrix format, usually used for very long seqs
242 $str='';
243 while (local ($_) = $self->_readline) {
244 my $str_tmp = $_;
245 $str_tmp =~ s/[\s;]//g;
246 $str .= $str_tmp;
247 last if length$str == $residuecount;
250 $name =~ s/ /_/g;
251 push @names, $name;
253 $str =~ s/[\s;]//g;
254 $count = @names;
255 $hash{$count} = $str;
257 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
258 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
262 # interleaved sections
263 $count = 0;
264 if ( $interleave ) { # only read next section if file is interleaved
265 while( $entry = $self->_readline) {
266 local ($_) = $entry;
267 if( s/\[[^\]]+\]//g ) { #] remove comments
268 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
270 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
271 $count = 0, next if $entry =~ /^\s*$/;
272 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
273 $str = $4;
274 $str =~ s/[\s;]//g;
275 $count++;
276 $hash{$count} .= $str;
278 $self->throw("Not a valid interleaved NEXUS file!
279 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
280 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
284 return 0 if @names < 1;
286 # sequence creation
287 $count = 0;
288 foreach $name ( @names ) {
289 $count++;
290 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
291 $seqname = $1;
292 $start = $2;
293 $end = $3;
294 } else {
295 $seqname=$name;
296 $start = 1;
297 $str = $hash{$count};
298 $str =~ s/[^A-Za-z]//g;
299 $end = length($str);
302 # consistency test
303 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ")
304 unless CORE::length($hash{$count}) == $residuecount;
306 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count},
307 '-id'=>$seqname,
308 '-start'=>$start,
309 '-end'=>$end,
310 'alphabet'=>$alphabet
312 $aln->add_seq($seq);
315 # if matchchar is used
316 $aln->unmatch($match) if $match;
318 # if equate ( e.g. equate="T=C G=A") is used
319 if ($equate) {
320 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
323 while (defined $entry &&
324 $entry !~ /endblock/i) {
325 $entry = $self->_readline;
328 return $aln if $aln->no_sequences;
329 return;
332 sub _read_taxlabels {
333 my ($self) = @_;
334 my ($name, @names);
335 while (my $entry = $self->_readline) {
336 last if $entry =~ m/^\s*(END)?;/i;
337 if( $entry =~ m/\s*(\S+)\s+/ ) {
338 ($name) = ($1);
339 $name =~ s/\[[^\[]+\]//g;
340 $name =~ s/\W/_/g;
341 push @names, $name;
344 return @names;
347 =head2 write_aln
349 Title : write_aln
350 Usage : $stream->write_aln(@aln)
351 Function: Writes the $aln object into the stream in interleaved NEXUS
352 format. Everything is written into a data block.
353 SimpleAlign methods match_char, missing_char and gap_char must be set
354 if you want to see them in the output.
355 Returns : 1 for success and 0 for error
356 Args : L<Bio::Align::AlignI> object
358 =cut
360 sub write_aln {
361 my ($self,@aln) = @_;
362 my $count = 0;
363 my $wrapped = 0;
364 my $maxname;
365 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
366 my ($match, $missing, $gap,$symbols) = ('', '', '','');
368 foreach my $aln (@aln) {
369 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
370 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
371 next;
373 $self->throw("All sequences in the alignment must be the same length")
374 unless $aln->is_flush($self->verbose);
376 $length = $aln->length();
378 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
379 $aln->id, $aln->no_sequences, $length));
380 $match = "match=". $aln->match_char if $aln->match_char;
381 $missing = "missing=". $aln->missing_char if $aln->missing_char;
382 $gap = "gap=". $aln->gap_char if $aln->gap_char;
384 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
385 if( $self->flag('symbols') && $aln->symbol_chars);
386 $self->_print
387 (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
388 $aln->get_seq_by_pos(1)->alphabet, $match,
389 $missing, $gap, $symbols));
391 # account for single quotes round names
392 my $indent = $aln->maxdisplayname_length+2;
394 $aln->set_displayname_flat();
395 foreach $seq ( $aln->each_seq() ) {
396 my $nmid = $aln->displayname($seq->get_nse());
397 if( $nmid =~ /[^\w\d]/ ) {
398 # put name in single quotes incase it contains any of
399 # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
400 # allowed in PAUP* and possible other software
402 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
403 } else {
404 $name = sprintf("%-${indent}s", $nmid);
406 $hash{$name} = $seq->seq;
407 push(@arr,$name);
410 while( $count < $length ) {
411 # there is another block to go!
412 foreach $name ( @arr ) {
413 my $dispname = $name;
414 # $dispname = '' if $wrapped;
415 $self->_print (sprintf("%${indent}s ",$dispname));
416 $tempcount = $count;
417 $index = 0;
418 while( ($tempcount + 10 < $length) && ($index < 5) ) {
419 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
420 $tempcount += 10;
421 $index++;
423 # last
424 if( $index < 5) {
425 # space to print!
426 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
427 $tempcount += 10;
429 $self->_print ("\n");
431 $self->_print ("\n\n");
432 $count = $tempcount;
433 $wrapped = 1;
435 if( $self->flag('endblock') ) {
436 $self->_print (";\n\nendblock;\n");
437 } else {
438 $self->_print (";\n\nend;\n");
441 $self->flush if $self->_flush_on_write && defined $self->_fh;
442 return 1;
445 =head2 flag
447 Title : flag
448 Usage : $obj->flag($name,$value)
449 Function: Get/Set a flag value
450 Returns : value of flag (a scalar)
451 Args : on set, new value (a scalar or undef, optional)
454 =cut
456 sub flag{
457 my ($self,$name,$val) = @_;
458 return $self->{'flag'}->{$name} = $val if defined $val;
459 return $self->{'flag'}->{$name};