fix problems with remote data changes
[bioperl-live.git] / Bio / NexmlIO.pm
blob4c70d358f9e709ebfb293968b016568d0a351ecc
1 # $Id: Nexml.pm 15889 2009-07-29 13:35:29Z chmille4 $
2 # BioPerl module for Bio::NexmlIO
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chase Miller <chmille4@gmail.com>
8 # Copyright Chase Miller
10 # You may distribute this module under the same terms as perl itself
12 # _history
13 # June 16, 2009 Largely rewritten by Chase Miller
15 # POD documentation - main docs before the code
17 =head1 NAME
19 Bio::NexmlIO - stream handler for NeXML documents
21 =head1 SYNOPSIS
23 #Instantiate a Bio::Nexml object and link it to a file
24 my $in_nexml = Bio::Nexml->new(-file => 'nexml_doc.xml', -format => 'Nexml');
26 #Read in some data
27 my $bptree1 = $in_nexml->next_tree();
28 my $bpaln1 = $in_nexml->next_aln();
29 my $bpseq1 = $in_nexml->next_seq();
31 #Use/manipulate data
32 ...
34 #Write data to nexml file
35 my $out_nexml = Bio::Nexml->new(-file => '>new_nexml_doc.xml', -format => 'Nexml');
36 $out_nexml->to_xml();
40 =head1 DESCRIPTION
42 Bio::NexmlIO is an I/O handler for a NeXML document. A NeXML document can
43 represent three different data types: simple sequences, alignments,
44 and trees. NexmlIO has four main methods next_tree, next_seq,
45 next_aln, and write. NexmlIO returns bioperl seq, tree, and aln
46 objects which can be manipulated then passed to the write method of a
47 new NexmlIO instance to allow the creation of a NeXML document.
49 Each bioperl object contains all the information necessary to recreate
50 a Bio::Phylo::Taxa object, so each time a bioperl object is converted
51 to a biophylo object, the bioperl object is checked to see if its
52 associated taxa has already been created (against a hash using the
53 NexmlIO_ID and Taxa_ID to create a unique string). If not, it is
54 created; if so, that taxa object is used to link the Bio::Phylo tree
55 or matrix.
57 For more information on the NeXML format, see L<http://www.nexml.org>.
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists.
67 Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 =head2 Support
74 Please direct usage questions or support issues to the mailing list:
76 I<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
83 =head2 Reporting Bugs
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via the
87 web:
89 https://redmine.open-bio.org/projects/bioperl/
91 =head1 AUTHOR - Chase Miller
93 Email chmille4@gmail.com
95 =head1 CONTRIBUTORS
97 Mark A. Jensen, maj -at- fortinbras -dot- com
99 =head1 APPENDIX
101 The rest of the documentation details each of the object
102 methods. Internal methods are usually preceded with a _
104 =cut
106 # Let the code begin...
109 package Bio::NexmlIO;
110 use strict;
111 #TODO Change this
112 use lib '..';
114 use Bio::SeqIO::nexml;
115 use Bio::AlignIO::nexml;
116 use Bio::TreeIO::nexml;
117 use Bio::Nexml::Factory;
119 use base qw(Bio::Root::IO);
121 my $nexml_fac = Bio::Nexml::Factory->new();
123 =head1 CONSTRUCTOR
125 =head2 new
127 Title : new
128 Usage : my $in_nexmlIO = Bio::NexmlIO->new(-file => 'data.nexml.xml');
129 Function: Creates a L<Bio::NexmlIO> object linked to a stream
130 Returns : a L<Bio::NexmlIO> object
131 Args : file name
133 See L<Bio::Root::IO>
135 =cut
137 sub new {
138 my($class,@args) = @_;
139 my $self = $class->SUPER::new(@args);
141 my %params = @args;
142 my $file_string = $params{'-file'};
144 #create unique ID by creating a scalar and using the memory address
145 my $ID = bless \(my $dummy), "UniqueID";
146 ($self->{'_ID'}) = sprintf("%s",\$ID) =~ /(0x[0-9a-fA-F]+)/;
148 unless ($file_string =~ m/^\>/) {
149 $self->{'_doc'} = Bio::Phylo::IO->parse('-file' => $params{'-file'}, '-format' => 'nexml', '-as_project' => '1');
153 return $self;
156 =head2 doc
158 Title : doc
159 Usage : my $nexml_doc = $in_nexmlIO->doc();
160 Function: returns a L<Bio::Phylo::Project> object that contains all the Bio::Phylo data objects parsed from the stream
161 Returns : a L<Bio::Phylo::Project> object
162 Args : none
164 =cut
166 sub doc {
167 my $self = shift;
168 return $self->{'_doc'};
171 # Takes the Bio::Phylo::Project object and creats BioPerl trees, alns, and seqs from it
172 sub _parse {
173 my ($self) = @_;
175 $self->{'_treeiter'} = 0;
176 $self->{'_seqiter'} = 0;
177 $self->{'_alniter'} = 0;
179 $self->{_trees} = $nexml_fac->create_bperl_tree($self);
180 $self->{_alns} = $nexml_fac->create_bperl_aln($self);
181 $self->{_seqs} = $nexml_fac->create_bperl_seq($self);
182 my $taxa_array = $self->doc->get_taxa();
184 $self->{'_parsed'} = 1; #success
187 =head1 ITERATORS
189 =head2 next_tree
191 Title : next_tree
192 Usage : $tree = $stream->next_tree
193 Function: Reads the next tree object from the stream and returns it.
194 Returns : a L<Bio::Tree::Tree> object
195 Args : none
197 See L<Bio::Root::IO>, L<Bio::Tree::Tree>
199 =cut
201 sub next_tree {
202 my $self = shift;
203 $self->_parse unless $self->{'_parsed'};
205 return $self->{'_trees'}->[ $self->{'_treeiter'}++ ];
208 =head2 next_seq
210 Title : next_seq
211 Usage : $seq = $stream->next_seq
212 Function: Reads the next seq object from the stream and returns it.
213 Returns : a L<Bio::Seq> object
214 Args : none
216 See L<Bio::Root::IO>, L<Bio::Seq>
218 =cut
220 sub next_seq {
221 my $self = shift;
222 unless ( $self->{'_parsed'} ) {
223 $self->_parse;
225 return $self->{'_seqs'}->[ $self->{'_seqiter'}++ ];
228 =head2 next_aln
230 Title : next_aln
231 Usage : $aln = $stream->next_aln
232 Function: Reads the next aln object from the stream and returns it.
233 Returns : a L<Bio::SimpleAlign> object
234 Args : none
236 See L<Bio::Root::IO>, L<Bio::SimpleAlign>
238 =cut
240 sub next_aln {
241 my $self = shift;
242 unless ( $self->{'_parsed'} ) {
243 $self->_parse;
245 return $self->{'_alns'}->[ $self->{'_alniter'}++ ];
248 sub _rewind {
249 my $self = shift;
250 my $elt = shift;
251 $self->{"_${elt}iter"} = 0 if defined $self->{"_${elt}iter"};
252 return 1;
255 =head2 rewind_seq
257 Title : rewind_seq
258 Usage : $stream->rewind_seq
259 Function: Resets the stream for seqs
260 Returns : none
261 Args : none
263 See L<Bio::Root::IO>, L<Bio::Seq>
265 =cut
267 sub rewind_seq { shift->_rewind('seq'); }
269 =head2 rewind_aln
271 Title : rewind_aln
272 Usage : $stream->rewind_aln
273 Function: Resets the stream for alns
274 Returns : none
275 Args : none
277 See L<Bio::Root::IO>, L<Bio::Simple::Align>
279 =cut
281 sub rewind_aln { shift->_rewind('aln'); }
283 =head2 rewind_tree
285 Title : rewind_tree
286 Usage : $stream->rewind_tree
287 Function: Resets the stream for trees
288 Returns : none
289 Args : none
291 See L<Bio::Root::IO>, L<Bio::tree::tree>
293 =cut
295 sub rewind_tree { shift->_rewind('tree'); }
297 =head2 write
299 Title : write
300 Usage : $stream->write(-alns => $alns,-seqs => $seqs,-trees => $trees)
301 Function: converts BioPerl seq, tree, and aln objects into Bio::Phylo
302 seq, tree, and aln objects, constructs a Bio::Phylo::Project
303 object made up of the newly created Bio::Phylo objects, and
304 writes the Bio::Phylo:Project object to the stream as a valid
305 nexml document
306 Returns : none
307 Args : \@L<Bio::Seq>, \@L<Bio::SimpleAlign>, \@L<Bio::Tree::Tree>
309 See L<Bio::Root::IO>, L<Bio::tree::tree>, L<Bio::Seq>, L<Bio::SimpleAlign>
311 =cut
313 sub write {
314 my ($self, @args) = @_;
316 my %params = @args;
318 my ($trees, $alns, $seqs) = @params{qw( -trees -alns -seqs )};
319 my %taxa_hash = ();
320 my %seq_matrices = ();
322 my $proj_doc = Bio::Phylo::Factory->create_project();
324 #convert trees to bio::Phylo objects
325 my $forest = Bio::Phylo::Factory->create_forest();
326 my @forests;
327 my @taxa_array;
328 my $ent;
329 my $taxa_o;
330 my $phylo_tree_o;
332 foreach my $tree (@$trees) {
333 my $nexml_id = $tree->get_tag_values('_NexmlIO_ID');
334 $taxa_o = undef;
335 if ( defined $taxa_hash{$nexml_id} ) {
336 $taxa_o = $taxa_hash{$nexml_id};
338 else {
339 ($taxa_o) = $nexml_fac->create_bphylo_taxa($tree);
340 $forest->set_taxa($taxa_o) if defined $taxa_o;
341 $taxa_hash{$nexml_id} = $taxa_o;
344 ($phylo_tree_o) = $nexml_fac->create_bphylo_tree($tree, $taxa_o);
346 $forest->insert($phylo_tree_o);
349 #convert matrices to Bio::Phylo objects
350 my $matrices = Bio::Phylo::Matrices->new();
351 my $phylo_matrix_o;
353 foreach my $aln (@$alns)
355 $taxa_o = undef;
356 if (defined $taxa_hash{ $aln->{_Nexml_ID} }) {
357 $taxa_o = $taxa_hash{$aln->{_Nexml_ID}};
359 else {
360 ($taxa_o) = $nexml_fac->create_bphylo_taxa($aln);
361 $taxa_hash{$aln->{_Nexml_ID}} = $taxa_o;
364 ($phylo_matrix_o) = $nexml_fac->create_bphylo_aln($aln, $taxa_o);
366 $phylo_matrix_o->set_taxa($taxa_o) if defined $taxa_o;
367 $matrices->insert($phylo_matrix_o);
370 my $seq_matrix_o;
371 my $datum;
372 #convert sequences to Bio::Phylo objects
373 foreach my $seq (@$seqs)
375 $taxa_o = undef;
376 #check if this Bio::Phylo::Taxa obj has already been created
377 if (defined $taxa_hash{ $seq->{_Nexml_ID} }) {
378 $taxa_o = $taxa_hash{$seq->{_Nexml_ID}};
380 else {
381 ($taxa_o) = $nexml_fac->create_bphylo_taxa($seq);
382 $taxa_hash{$seq->{_Nexml_ID}} = $taxa_o;
384 $datum = $nexml_fac->create_bphylo_seq($seq, $taxa_o);
385 #check if this Bio::Phylo::Matrices::Matrix obj has already been created
386 if (defined $seq_matrices{ $seq->{_Nexml_matrix_ID} }) {
387 $seq_matrix_o = $seq_matrices{$seq->{_Nexml_matrix_ID}};
388 my $taxon_name = $datum->get_taxon()->get_name();
389 $datum->unset_taxon();
390 $seq_matrix_o->insert($datum);
391 $datum->set_taxon($seq_matrix_o->get_taxa()->get_by_name($taxon_name));
393 else {
394 $seq_matrix_o = Bio::Phylo::Factory->create_matrix('-type' => $datum->moltype);
395 $seq_matrices{$seq->{_Nexml_matrix_ID}} = $seq_matrix_o;
396 $seq_matrix_o->set_taxa($taxa_o) if defined $taxa_o;
397 $seq_matrix_o->insert($datum);
399 #get matrix label
400 my $feat = ($seq->get_SeqFeatures())[0];
401 my $matrix_label = ($feat->get_tag_values('matrix_label'))[0] if $feat->has_tag('id');
402 $seq_matrix_o->set_name($matrix_label);
404 $matrices->insert($seq_matrix_o);
408 #Add matrices and forest objects to project object which represents a complete nexml document
409 if($forest->first) {
410 $proj_doc->insert($forest);
412 while(my $curr_matrix = $matrices->next) {
413 $proj_doc->insert($curr_matrix);
416 #write nexml document to stream
417 my $ret = $self->_print($proj_doc->to_xml(-compact=>1));
418 $self->flush;
419 return($ret);
422 =head2 extract_seqs
424 Title : extract_seqs
425 Usage : $nexmlIO->extract_seqs(-file => ">$outfile", -format => $format)
426 Function: converts BioPerl seqs stored in the NexmlIO object into the provided
427 format and writes it to the provided file. Uses L<Bio::SeqIO> to do
428 the conversion and writing.
429 Returns : none
430 Args : file to write to, format to be converted to
432 See L<Bio::Seq>, L<Bio::SeqIO>
434 =cut
436 sub extract_seqs {
437 my $self = shift;
438 unless ( $self->{'_parsed'} ) {
439 $self->_parse;
442 my %params = @_;
443 my $remove_spaces = 0;
444 my $ret = 0;
445 my ($format, $file) = @params{qw( -format -file)};
447 for ($format) {
448 /^fasta$/i && do {
449 # this is ok, flag so that the nexmlid gets converted;
450 $remove_spaces = 1;
451 last;
453 # default
454 do {
455 $self->throw("Format '$format' not yet supported for extraction");
459 my $seqIO = Bio::SeqIO->new(-format => $format, -file => $file);
460 my $seqs = $self->{_seqs};
461 foreach my $seq (@$seqs) {
462 if ($remove_spaces) {
463 my $id = $seq->id;
464 $id =~ s/ /_/;
465 $seq->id($id);
467 $ret = $seqIO->write_seq($seq);
469 return $ret;
472 =head2 extract_alns
474 Title : extract_alns
475 Usage : $nexmlIO->extract_alns(-file => ">$outfile", -format => $format)
476 Function: converts BioPerl alns stored in the NexmlIO object into the provided
477 format and writes it to the provided file. Uses L<Bio::AlignIO> to do
478 the conversion and writing.
479 Returns : none
480 Args : file to write to, format to be converted to
482 See L<Bio::SimpleAlign>, L<Bio::AlignIO>
484 =cut
486 sub extract_alns {
487 my $self = shift;
488 unless ( $self->{'_parsed'} ) {
489 $self->_parse;
492 my $ret = 0;
493 my %params = @_;
494 my ($format, $file) = @params{qw( -format -file)};
496 my $alignIO = Bio::AlignIO->new(-format => $format, -file => $file);
497 my $alns = $self->{_alns};
498 foreach my $aln (@$alns) {
499 $ret = $alignIO->write_aln($aln);
501 return $ret;
504 =head2 extract_trees
506 Title : extract_trees
507 Usage : $nexmlIO->extract_trees(-file => ">$outfile", -format => $format)
508 Function: converts BioPerl trees stored in the NexmlIO object into the provided
509 format and writes it to the provided file. Uses L<Bio::TreeIO> to do
510 the conversion and writing.
511 Returns : none
512 Args : file to write to, format to be converted to
514 See L<Bio::Tree::Tree>, L<Bio::TreeIO>
516 =cut
518 sub extract_trees {
519 my $self = shift;
520 unless ( $self->{'_parsed'} ) {
521 $self->_parse;
524 my $ret = 0;
525 my %params = @_;
526 my ($format, $file) = @params{qw( -format -file)};
528 my $treeIO = Bio::TreeIO->new(-format => $format, -file => $file);
529 my $trees = $self->{_trees};
530 foreach my $tree (@$trees) {
531 $treeIO->write_tree($tree);
532 $ret = 1;
534 return $ret;