Sync README and README.md
[bioperl-live.git] / Bio / AlignIO / nexus.pm
blob2364f7a20b895a7865b3b62114c97cfeae07c09a
2 # BioPerl module for Bio::AlignIO::nexus
4 # Copyright Heikki Lehvaslaiho
7 =head1 NAME
9 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
11 =head1 SYNOPSIS
13 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
15 use Bio::AlignIO;
17 my $in = Bio::AlignIO->new(-format => 'nexus',
18 -file => 'aln.nexus');
19 while( my $aln = $in->next_aln ) {
20 # do something with the alignment
23 =head1 DESCRIPTION
25 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
26 data blocks. See method documentation for supported NEXUS features.
28 =head1 ACKNOWLEDGEMENTS
30 Will Fisher has written an excellent standalone NEXUS format parser in
31 Perl, readnexus. A number of tricks were adapted from it.
33 =head1 FEEDBACK
35 =head2 Support
37 Please direct usage questions or support issues to the mailing list:
39 I<bioperl-l@bioperl.org>
41 rather than to the module maintainer directly. Many experienced and
42 reponsive experts will be able look at the problem and quickly
43 address it. Please include a thorough description of the problem
44 with code and data examples if at all possible.
46 =head2 Reporting Bugs
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 the bugs and their resolution. Bug reports can be submitted via the
50 web:
52 https://github.com/bioperl/bioperl-live/issues
54 =head1 AUTHORS - Heikki Lehvaslaiho
56 Email: heikki-at-bioperl-dot-org
58 =head1 APPENDIX
60 The rest of the documentation details each of the object
61 methods. Internal methods are usually preceded with a _
63 =cut
65 # Let the code begin...
67 package Bio::AlignIO::nexus;
68 use vars qw(%valid_type);
69 use strict;
70 no strict "refs";
73 use base qw(Bio::AlignIO);
75 BEGIN {
76 %valid_type = map {$_, 1} qw( dna rna protein standard );
77 # standard throws error: inherited from Bio::PrimarySeq
80 =head2 new
82 Title : new
83 Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
84 Function: returns a new Bio::AlignIO object to handle clustalw files
85 Returns : Bio::AlignIO::clustalw object
86 Args : -verbose => verbosity setting (-1,0,1,2)
87 -file => name of file to read in or with ">" - writeout
88 -fh => alternative to -file param - provide a filehandle
89 to read from/write to
90 -format => type of Alignment Format to process or produce
92 Customization of nexus flavor output
94 -show_symbols => print the symbols="ATGC" in the data definition
95 (MrBayes does not like this)
96 boolean [default is 1]
97 -show_endblock => print an 'endblock;' at the end of the data
98 (MyBayes does not like this)
99 boolean [default is 1]
101 =cut
103 sub _initialize {
104 my ($self, @args) = @_;
105 $self->SUPER::_initialize(@args);
106 my ($show_symbols, $endblock) =
107 $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
108 my @names = qw(symbols endblock);
109 for my $v ( $show_symbols, $endblock ) {
110 $v = 1 unless defined $v; # default value is 1
111 my $n = shift @names;
112 $self->flag($n, $v);
117 =head2 next_aln
119 Title : next_aln
120 Usage : $aln = $stream->next_aln()
121 Function: Returns the next alignment in the stream.
123 Supports the following NEXUS format features:
124 - The file has to start with '#NEXUS'
125 - Reads in the name of the alignment from a comment
126 (anything after 'TITLE: ') .
127 - Sequence names can be given in a taxa block, too.
128 - If matchchar notation is used, converts
129 them back to sequence characters.
130 - Does character conversions specified in the
131 NEXUS equate command.
132 - Sequence names of type 'Homo sapiens' and
133 Homo_sapiens are treated identically.
135 Returns : L<Bio::Align::AlignI> object
136 Args :
139 =cut
142 sub next_aln {
143 my $self = shift;
144 my $entry;
145 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
146 $match, $gap, $missing, $equate, $interleave,
147 $name,$str,@names,$seqname,$start,$end,$count,$seq);
148 local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
149 local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
150 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
152 # file starts with '#NEXUS' but we allow white space only lines before it
153 $entry = $self->_readline;
154 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
156 return unless $entry;
157 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
158 unless ($entry && $entry =~ /^#NEXUS/i);
160 # skip anything before either the taxa or data block
161 # but read in the optional title in a comment
162 while (defined($entry = $self->_readline)) {
163 local ($_) = $entry;
164 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
165 last if /^begin +data/i || /^begin +taxa/i;
167 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
169 # data and taxa blocks
170 my $incomment;
171 while (defined ($entry = $self->_readline)) {
172 local ($_) = $entry;
173 next if s/\[[^\]]+\]//g; # remove comments
174 if( s/\[[^\]]+$// ) {
175 $incomment = 1;
176 # skip line if it is now empty or contains only whitespace
177 next if /^\s*$/;
178 } elsif($incomment) {
179 if( s/^[^\]]*\]// ) {
180 $incomment = 0;
181 } else {
182 next;
184 } elsif( /taxlabels/i ) {
185 # doesn't deal with taxlabels adequately and can mess things up!
186 # @names = $self->_read_taxlabels;
187 } else {
189 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
190 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
191 /matchchar\s*=\s*(.)/i and $match = $1;
192 /gap\s*=\s*(.)/i and $gap = $1;
193 /missing\s*=\s*(.)/i and $missing = $1;
194 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
195 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
196 /interleave/i and $interleave = 1 ;
197 last if /matrix/io;
200 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
201 unless $alphabet;
202 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
203 unless $valid_type{$alphabet};
204 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
205 if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
206 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
207 if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
209 $aln->gap_char($gap);
210 $aln->missing_char($missing);
213 # if data is not right after the matrix line
214 # read the empty lines out
216 while ($entry = $self->_readline) {
217 unless ($entry =~ /^\s+$/) {
218 $self->_pushback($entry);
219 last;
224 # matrix command
226 # first alignment section
227 if (@names == 0) { # taxa block did not exist
228 while ($entry = $self->_readline) {
229 local ($_) = $entry;
230 if( s/\[[^\]]+\]//g ) { #] remove comments
231 next if /^\s*$/;
232 # skip line if it is now empty or contains only whitespace
234 if ($interleave && defined$count && ($count <= $seqcount)) {
235 /^\s+$/ and last;
236 } else {
237 /^\s+$/ and next;
239 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
240 #/^\s*;\s*$/ and last;
241 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
242 # get single and double quoted names, or all the first
243 # nonwhite word as the name, and remained is seq
244 #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
245 $name = ($2 || $3);
246 if ($4) {
247 # seq is on same line as name
248 # this is the usual NEXUS format
249 $str = $4;
250 } else {
251 # otherwise get seq from following lines. No comments allowed
252 # a less common matrix format, usually used for very long seqs
253 $str='';
254 while (local ($_) = $self->_readline) {
255 my $str_tmp = $_;
256 $str_tmp =~ s/[\s;]//g;
257 $str .= $str_tmp;
258 last if length$str == $residuecount;
261 $name =~ s/ /_/g;
262 push @names, $name;
264 $str =~ s/[\s;]//g;
265 $count = @names;
266 $hash{$count} = $str;
268 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
269 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
273 # interleaved sections
274 $count = 0;
275 if ( $interleave ) { # only read next section if file is interleaved
276 while( $entry = $self->_readline) {
277 local ($_) = $entry;
278 if( s/\[[^\]]+\]//g ) { #] remove comments
279 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
281 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
282 $count = 0, next if $entry =~ /^\s*$/;
283 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
284 $str = $4;
285 $str =~ s/[\s;]//g;
286 $count++;
287 $hash{$count} .= $str;
289 $self->throw("Not a valid interleaved NEXUS file!
290 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
291 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
295 return if @names < 1;
297 # sequence creation
298 $count = 0;
299 foreach $name ( @names ) {
300 $count++;
301 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
302 ($seqname,$start,$end) = ($1,$2,$3);
303 } else {
304 ($seqname,$start,$str) = ($name,1,$hash{$count});
305 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
306 $end = length($str);
309 # consistency test
310 $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
311 unless CORE::length($hash{$count}) == $residuecount;
313 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
314 '-display_id' => $seqname,
315 '-start' => $start,
316 '-end' => $end,
317 '-alphabet' => $alphabet
319 $aln->add_seq($seq);
322 # if matchchar is used
323 $aln->unmatch($match) if $match;
325 # if equate ( e.g. equate="T=C G=A") is used
326 if ($equate) {
327 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
330 while (defined $entry &&
331 $entry !~ /endblock/i) {
332 $entry = $self->_readline;
335 return $aln if $aln->num_sequences;
336 return;
339 sub _read_taxlabels {
340 my ($self) = @_;
341 my ($name, @names);
342 while (my $entry = $self->_readline) {
343 last if $entry =~ m/^\s*(END)?;/i;
344 if( $entry =~ m/\s*(\S+)\s+/ ) {
345 ($name) = ($1);
346 $name =~ s/\[[^\[]+\]//g;
347 $name =~ s/\W/_/g;
348 push @names, $name;
351 return @names;
354 =head2 write_aln
356 Title : write_aln
357 Usage : $stream->write_aln(@aln)
358 Function: Writes the $aln object into the stream in interleaved NEXUS
359 format. Everything is written into a data block.
360 SimpleAlign methods match_char, missing_char and gap_char must be set
361 if you want to see them in the output.
362 Returns : 1 for success and 0 for error
363 Args : L<Bio::Align::AlignI> object
365 =cut
367 sub write_aln {
368 my ($self,@aln) = @_;
369 my $count = 0;
370 my $wrapped = 0;
371 my $maxname;
372 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
373 my ($match, $missing, $gap,$symbols) = ('', '', '','');
375 foreach my $aln (@aln) {
376 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
377 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
378 next;
380 $self->throw("All sequences in the alignment must be the same length")
381 unless $aln->is_flush($self->verbose);
383 $length = $aln->length();
385 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
386 $aln->id, $aln->num_sequences, $length));
387 $match = "match=". $aln->match_char if $aln->match_char;
388 $missing = "missing=". $aln->missing_char if $aln->missing_char;
389 $gap = "gap=". $aln->gap_char if $aln->gap_char;
391 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
392 if( $self->flag('symbols') && $aln->symbol_chars);
393 $self->_print
394 (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
395 $aln->get_seq_by_pos(1)->alphabet, $match,
396 $missing, $gap, $symbols));
398 # account for single quotes round names
399 my $indent = $aln->maxdisplayname_length+2;
401 $aln->set_displayname_flat();
402 foreach $seq ( $aln->each_seq() ) {
403 my $nmid = $aln->displayname($seq->get_nse());
404 if( $nmid =~ /[^\w\d\.]/ ) {
405 # put name in single quotes incase it contains any of
406 # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
407 # allowed in PAUP* and possible other software
409 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
410 } else {
411 $name = sprintf("%-${indent}s", $nmid);
413 $hash{$name} = $seq->seq;
414 push(@arr,$name);
417 while( $count < $length ) {
418 # there is another block to go!
419 foreach $name ( @arr ) {
420 my $dispname = $name;
421 # $dispname = '' if $wrapped;
422 $self->_print (sprintf("%${indent}s ",$dispname));
423 $tempcount = $count;
424 $index = 0;
425 while( ($tempcount + 10 < $length) && ($index < 5) ) {
426 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
427 $tempcount += 10;
428 $index++;
430 # last
431 if( $index < 5) {
432 # space to print!
433 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
434 $tempcount += 10;
436 $self->_print ("\n");
438 $self->_print ("\n\n");
439 $count = $tempcount;
440 $wrapped = 1;
442 if( $self->flag('endblock') ) {
443 $self->_print (";\n\nendblock;\n");
444 } else {
445 $self->_print (";\n\nend;\n");
448 $self->flush if $self->_flush_on_write && defined $self->_fh;
449 return 1;
452 =head2 flag
454 Title : flag
455 Usage : $obj->flag($name,$value)
456 Function: Get/Set a flag value
457 Returns : value of flag (a scalar)
458 Args : on set, new value (a scalar or undef, optional)
461 =cut
463 sub flag{
464 my ($self,$name,$val) = @_;
465 return $self->{'flag'}->{$name} = $val if defined $val;
466 return $self->{'flag'}->{$name};