Bio/AlignIO/nexus.pm

   1 #
   2 # BioPerl module for Bio::AlignIO::nexus
   3 #
   4 # Copyright Heikki Lehvaslaiho
   5 #
   6
   7 =head1 NAME
   8
   9 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
  10
  11 =head1 SYNOPSIS
  12
  13 Do not use this module directly.  Use it via the L<Bio::AlignIO> class.
  14
  15     use Bio::AlignIO;
  16
  17     my $in = Bio::AlignIO->new(-format => 'nexus',
  18                               -file   => 'aln.nexus');
  19     while( my $aln = $in->next_aln ) {
  20         # do something with the alignment
  21     }
  22
  23 =head1 DESCRIPTION
  24
  25 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
  26 data blocks. See method documentation for supported NEXUS features.
  27
  28 =head1 ACKNOWLEDGEMENTS
  29
  30 Will Fisher has written an excellent standalone NEXUS format parser in
  31 Perl, readnexus. A number of tricks were adapted from it.
  32
  33 =head1 FEEDBACK
  34
  35 =head2 Support
  36
  37 Please direct usage questions or support issues to the mailing list:
  38
  39 I<bioperl-l@bioperl.org>
  40
  41 rather than to the module maintainer directly. Many experienced and
  42 reponsive experts will be able look at the problem and quickly
  43 address it. Please include a thorough description of the problem
  44 with code and data examples if at all possible.
  45
  46 =head2 Reporting Bugs
  47
  48 Report bugs to the Bioperl bug tracking system to help us keep track
  49 the bugs and their resolution.  Bug reports can be submitted via the
  50 web:
  51
  52   https://github.com/bioperl/bioperl-live/issues
  53
  54 =head1 AUTHORS - Heikki Lehvaslaiho
  55
  56 Email: heikki-at-bioperl-dot-org
  57
  58 =head1 APPENDIX
  59
  60 The rest of the documentation details each of the object
  61 methods. Internal methods are usually preceded with a _
  62
  63 =cut
  64
  65 # Let the code begin...
  66
  67 package Bio::AlignIO::nexus;
  68 use vars qw(%valid_type);
  69 use strict;
  70 no strict "refs";
  71
  72
  73 use base qw(Bio::AlignIO);
  74
  75 BEGIN {
  76     %valid_type = map {$_, 1} qw( dna rna protein standard );
  77     # standard throws error: inherited from Bio::PrimarySeq
  78 }
  79
  80 =head2 new
  81
  82  Title   : new
  83  Usage   : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
  84  Function: returns a new Bio::AlignIO object to handle clustalw files
  85  Returns : Bio::AlignIO::clustalw object
  86  Args    : -verbose => verbosity setting (-1,0,1,2)
  87            -file    => name of file to read in or with ">" - writeout
  88            -fh      => alternative to -file param - provide a filehandle
  89                        to read from/write to
  90            -format  => type of Alignment Format to process or produce
  91
  92            Customization of nexus flavor output
  93
  94            -show_symbols => print the symbols="ATGC" in the data definition
  95                             (MrBayes does not like this)
  96                             boolean [default is 1]
  97            -show_endblock => print an 'endblock;' at the end of the data
  98                             (MyBayes does not like this)
  99                             boolean [default is 1]
 100
 101 =cut
 102
 103 sub _initialize {
 104     my ($self, @args) = @_;
 105     $self->SUPER::_initialize(@args);
 106     my ($show_symbols, $endblock) =
 107         $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
 108     my @names = qw(symbols endblock);
 109     for my $v ( $show_symbols, $endblock ) {
 110         $v = 1 unless defined $v; # default value is 1
 111         my $n = shift @names;
 112         $self->flag($n, $v);
 113     }
 114 }
 115
 116
 117 =head2 next_aln
 118
 119  Title   : next_aln
 120  Usage   : $aln = $stream->next_aln()
 121  Function: Returns the next alignment in the stream.
 122
 123            Supports the following NEXUS format features:
 124            - The file has to start with '#NEXUS'
 125            - Reads in the name of the alignment from a comment
 126              (anything after 'TITLE: ') .
 127            - Sequence names can be given in a taxa block, too.
 128            - If matchchar notation is used, converts
 129              them back to sequence characters.
 130            - Does character conversions specified in the
 131              NEXUS equate command.
 132            - Sequence names of type 'Homo sapiens' and
 133              Homo_sapiens are treated identically.
 134
 135  Returns : L<Bio::Align::AlignI> object
 136  Args    :
 137
 138
 139 =cut
 140
 141
 142 sub next_aln {
 143     my $self = shift;
 144     my $entry;
 145     my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
 146         $match, $gap, $missing, $equate, $interleave,
 147         $name,$str,@names,$seqname,$start,$end,$count,$seq);
 148     local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
 149     local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
 150     my $aln =  Bio::SimpleAlign->new(-source => 'nexus');
 151
 152     # file starts with '#NEXUS' but we allow white space only lines before it
 153     $entry = $self->_readline;
 154     $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
 155
 156     return unless $entry;
 157     $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
 158         unless ($entry && $entry =~ /^#NEXUS/i);
 159
 160     # skip anything before either the taxa or data block
 161     # but read in the optional title in a comment
 162     while (defined($entry = $self->_readline)) {
 163         local ($_) = $entry;
 164         /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
 165         last if /^begin +data/i || /^begin +taxa/i;
 166     }
 167     $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
 168
 169     # data and taxa blocks
 170     my $incomment;
 171     while (defined ($entry = $self->_readline)) {
 172         local ($_) =  $entry;
 173         next if s/\[[^\]]+\]//g; # remove comments
 174         if( s/\[[^\]]+$// ) {
 175             $incomment = 1;
 176                  # skip line if it is now empty or contains only whitespace
 177             next if /^\s*$/;
 178         } elsif($incomment) {
 179             if( s/^[^\]]*\]// ) {
 180                          $incomment = 0;
 181             } else {
 182                          next;
 183             }
 184         } elsif( /taxlabels/i ) {
 185             # doesn't deal with taxlabels adequately and can mess things up!
 186             # @names = $self->_read_taxlabels;
 187         } else {
 188
 189             /ntax\s*=\s*(\d+)/i        and $seqcount = $1;
 190             /nchar\s*=\s*(\d+)/i       and $residuecount = $1;
 191             /matchchar\s*=\s*(.)/i     and $match = $1;
 192             /gap\s*=\s*(.)/i           and $gap = $1;
 193             /missing\s*=\s*(.)/i       and $missing = $1;
 194             /equate\s*=\s*\"([^\"]+)/i and $equate = $1;  # "e.g. equate="T=C G=A";
 195             /datatype\s*=\s*(\w+)/i    and $alphabet = lc $1;
 196             /interleave/i              and $interleave = 1 ;
 197             last if /matrix/io;
 198         }
 199     }
 200     $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
 201         unless $alphabet;
 202     $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
 203         unless $valid_type{$alphabet};
 204     $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
 205         if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
 206     $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
 207         if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
 208
 209     $aln->gap_char($gap);
 210     $aln->missing_char($missing);
 211
 212     #
 213     # if data is not right after the matrix line
 214     #  read the empty lines out
 215     #
 216     while ($entry = $self->_readline) {
 217         unless ($entry =~ /^\s+$/) {
 218             $self->_pushback($entry);
 219             last;
 220         }
 221     }
 222
 223     #
 224     # matrix command
 225     #
 226     # first alignment section
 227     if (@names == 0) {          # taxa block did not exist
 228         while ($entry = $self->_readline) {
 229                 local ($_) =  $entry;
 230                 if( s/\[[^\]]+\]//g ) { #]  remove comments
 231                         next if /^\s*$/;
 232                         # skip line if it is now empty or contains only whitespace
 233                 }
 234                 if ($interleave && defined$count && ($count <= $seqcount)) {
 235                         /^\s+$/ and last;
 236                 } else {
 237                         /^\s+$/ and next;
 238                 }
 239                 /^\s*;/ and last;       # stop if colon at end of matrix is on it's own line
 240                 #/^\s*;\s*$/ and last;
 241                 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
 242                         # get single and double quoted names, or all the first
 243          # nonwhite word as the name, and remained is seq
 244                         #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
 245                         $name = ($2 || $3);
 246                         if  ($4) {
 247                                 # seq is on same line as name
 248                                 # this is the usual NEXUS format
 249                                 $str = $4;
 250                         } else {
 251                                 # otherwise get seq from following lines. No comments allowed
 252                                 # a less common matrix format, usually used for very long seqs
 253                                 $str='';
 254                                 while (local ($_) = $self->_readline) {
 255                                         my $str_tmp = $_;
 256                                         $str_tmp =~ s/[\s;]//g;
 257                                         $str .= $str_tmp;
 258                                         last if length$str == $residuecount;
 259                                 }
 260                         }
 261                         $name =~ s/ /_/g;
 262                         push @names, $name;
 263
 264                         $str =~ s/[\s;]//g;
 265                         $count =  @names;
 266                         $hash{$count} = $str;
 267                 }
 268                 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
 269                 /;/ and last;   # stop if colon at end of matrix is on the same line as the last seq
 270         }
 271 }
 272
 273     # interleaved sections
 274     $count = 0;
 275     if ( $interleave ) {        # only read next section if file is interleaved
 276         while( $entry = $self->_readline) {
 277             local ($_) =  $entry;
 278             if( s/\[[^\]]+\]//g ) { #]  remove comments
 279                 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
 280             }
 281             /^\s*;/ and last;           # stop if colon at end of matrix is on it's own line
 282             $count = 0, next if $entry =~ /^\s*$/;
 283             if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
 284                 $str = $4;
 285                 $str =~ s/[\s;]//g;
 286                 $count++;
 287                 $hash{$count} .= $str;
 288             };
 289             $self->throw("Not a valid interleaved NEXUS file!
 290                 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
 291             /;/ and last;       # stop if colon at end of matrix is on the same line as the last seq
 292         }
 293     }
 294
 295     return if @names < 1;
 296
 297     # sequence creation
 298     $count = 0;
 299     foreach $name ( @names ) {
 300         $count++;
 301         if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
 302             ($seqname,$start,$end) = ($1,$2,$3);
 303         } else {
 304             ($seqname,$start,$str) = ($name,1,$hash{$count});
 305             $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
 306             $end = length($str);
 307         }
 308
 309         # consistency test
 310         $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
 311             unless CORE::length($hash{$count}) == $residuecount;
 312
 313         $seq = Bio::LocatableSeq->new('-seq'        => $hash{$count},
 314                                       '-display_id' => $seqname,
 315                                       '-start'      => $start,
 316                                       '-end'        => $end,
 317                                      '-alphabet'    => $alphabet
 318                                       );
 319         $aln->add_seq($seq);
 320     }
 321
 322     # if matchchar is used
 323     $aln->unmatch($match) if $match;
 324
 325     # if equate ( e.g. equate="T=C G=A") is used
 326     if ($equate) {
 327         $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
 328     }
 329
 330     while  (defined $entry &&
 331             $entry !~ /endblock/i) {
 332         $entry = $self->_readline;
 333     }
 334
 335     return $aln if $aln->num_sequences;
 336         return;
 337 }
 338
 339 sub _read_taxlabels {
 340     my ($self) = @_;
 341     my ($name, @names);
 342     while (my $entry = $self->_readline) {
 343         last if $entry =~ m/^\s*(END)?;/i;
 344         if( $entry =~ m/\s*(\S+)\s+/ ) {
 345             ($name) = ($1);
 346             $name =~ s/\[[^\[]+\]//g;
 347             $name =~ s/\W/_/g;
 348             push @names, $name;
 349         }
 350     }
 351     return @names;
 352 }
 353
 354 =head2 write_aln
 355
 356  Title   : write_aln
 357  Usage   : $stream->write_aln(@aln)
 358  Function: Writes the $aln object into the stream in interleaved NEXUS
 359            format. Everything is written into a data block.
 360            SimpleAlign methods match_char, missing_char and gap_char must be set
 361            if you want to see them in the output.
 362  Returns : 1 for success and 0 for error
 363  Args    : L<Bio::Align::AlignI> object
 364
 365 =cut
 366
 367 sub write_aln {
 368     my ($self,@aln) = @_;
 369     my $count = 0;
 370     my $wrapped = 0;
 371     my $maxname;
 372     my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
 373     my ($match, $missing, $gap,$symbols) = ('', '', '','');
 374
 375     foreach my $aln (@aln) {
 376         if( ! $aln || ! $aln->isa('Bio::Align::AlignI')  ) {
 377             $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
 378             next;
 379         }
 380         $self->throw("All sequences in the alignment must be the same length")
 381             unless $aln->is_flush($self->verbose);
 382
 383         $length  = $aln->length();
 384
 385         $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
 386                                $aln->id, $aln->num_sequences, $length));
 387         $match = "match=". $aln->match_char if $aln->match_char;
 388         $missing = "missing=". $aln->missing_char if $aln->missing_char;
 389         $gap = "gap=". $aln->gap_char if $aln->gap_char;
 390
 391         $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
 392             if( $self->flag('symbols') && $aln->symbol_chars);
 393         $self->_print
 394             (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
 395                      $aln->get_seq_by_pos(1)->alphabet, $match,
 396                      $missing, $gap, $symbols));
 397
 398                      # account for single quotes round names
 399         my $indent = $aln->maxdisplayname_length+2;
 400
 401         $aln->set_displayname_flat();
 402         foreach $seq ( $aln->each_seq() ) {
 403             my $nmid = $aln->displayname($seq->get_nse());
 404             if( $nmid =~ /[^\w\d\.]/ ) {
 405               # put name in single quotes incase it contains any of
 406               # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
 407               # allowed in PAUP* and possible other software
 408
 409                 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
 410             } else {
 411                 $name = sprintf("%-${indent}s", $nmid);
 412             }
 413             $hash{$name} = $seq->seq;
 414             push(@arr,$name);
 415         }
 416
 417         while( $count < $length ) {
 418             # there is another block to go!
 419             foreach $name ( @arr ) {
 420                 my $dispname = $name;
 421 #                $dispname = '' if $wrapped;
 422                 $self->_print (sprintf("%${indent}s  ",$dispname));
 423                 $tempcount = $count;
 424                 $index = 0;
 425                 while( ($tempcount + 10 < $length) && ($index < 5)  ) {
 426                     $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
 427                     $tempcount += 10;
 428                     $index++;
 429                 }
 430                 # last
 431                 if( $index < 5) {
 432                     # space to print!
 433                     $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
 434                     $tempcount += 10;
 435                 }
 436                 $self->_print ("\n");
 437             }
 438             $self->_print ("\n\n");
 439             $count = $tempcount;
 440             $wrapped = 1;
 441         }
 442         if( $self->flag('endblock') ) {
 443             $self->_print (";\n\nendblock;\n");
 444         } else {
 445             $self->_print (";\n\nend;\n");
 446         }
 447     }
 448     $self->flush if $self->_flush_on_write && defined $self->_fh;
 449     return 1;
 450 }
 451
 452 =head2 flag
 453
 454  Title   : flag
 455  Usage   : $obj->flag($name,$value)
 456  Function: Get/Set a flag value
 457  Returns : value of flag (a scalar)
 458  Args    : on set, new value (a scalar or undef, optional)
 459
 460
 461 =cut
 462
 463 sub flag{
 464     my ($self,$name,$val) = @_;
 465     return $self->{'flag'}->{$name} = $val if defined $val;
 466     return $self->{'flag'}->{$name};
 467 }
 468
 469 1;