1 # $Id: Util.pm 15875 2009-07-21 19:20:00Z chmille4 $
3 # BioPerl module for Bio::Nexml::Factory
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Chase Miller <chmille4@gmail.com>
9 # Copyright Chase Miller
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Nexml::Factory - A factory module for creating BioPerl and Bio::Phylo objects from/to nexml documents
21 Do not use this module directly. It shoulde be used through
22 Bio::NexmlIO, Bio::SeqIO::nexml, Bio::AlignIO::nexml, or
28 This is a factory/utility module in the Nexml namespace. It contains
29 methods that are needed by multiple modules.
31 This module handles the creation of BioPerl objects from Bio::Phylo
32 objects and vice versa, which is used to read and write nexml
33 documents to and from BioPerl objects.
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Chase Miller
67 Email chmille4@gmail.com
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
79 package Bio
::Nexml
::Factory
;
85 unless (eval "require Bio::Phylo; 1") {
86 Bio
::Root
::Root
->throw("Bio::Phylo package required; see http://www.nexml.org for download details");
90 use Bio
::Phylo
::Factory
;
91 use Bio
::Phylo
::Matrices
;
92 use Bio
::Phylo
::Matrices
::Matrix
;
93 use Bio
::Phylo
::Matrices
::Datum
;
94 use Bio
::Phylo
::Forest
::Tree
;
95 use Bio
::Phylo
::Matrices
;
98 use Bio
::SeqFeature
::Generic
;
101 use base
qw(Bio::Root::Root);
103 my $fac = Bio
::Phylo
::Factory
->new();
109 Usage : my $obj = Bio::Nexml::Factory->new();
110 Function: Builds a new L<Bio::Nexml::Factory> object
111 Returns : L<Bio::Nexml::Factory> object
117 my($class,@args) = @_;
118 my $self = $class->SUPER::new
(@args);
121 #should all these creates be private methods?
124 =head2 create_bperl_aln
126 Title : create_bperl_aln
127 Usage : my @alns = $factory->create_bperl_aln($objIO);
128 Function: Converts Bio::Phylo::Matrices::Matrix objects into L<Bio::SimpleAlign> objects
129 Returns : an array of L<Bio::SimpleAlign> objects
130 Args : Bio::NexmlIO, Bio::SeqIO, Bio::AlignIO, or Bio::TreeIO
132 see [http://search.cpan.org/~rvosa/Bio-Phylo/lib/Bio/Phylo/Project.pm Bio::Phylo::Project]
136 sub create_bperl_aln
{
137 my ($self, $caller) = @_;
138 my ($start, $end, $seq, $desc);
139 my $matrices = $caller->doc->get_matrices();
142 foreach my $matrix (@
$matrices)
144 #check if mol_type is something that makes sense to be an aln
145 my $mol_type = lc($matrix->get_type());
146 unless ($mol_type eq 'dna' || $mol_type eq 'rna' || $mol_type eq 'protein')
149 # something for the back-burner: BioPerl has objects
150 # to handle arbitrary genotypes; might be cool to
151 # be able to create something besides alignments
155 #continue creating an aln
156 my $aln = Bio
::SimpleAlign
->new();
157 my $taxa = $matrix->get_taxa();
159 # TODO: should $caller->{_ID} always be defined?
160 # ATM, this is a Bio::AlignIO::nexml stream...
161 $aln->{_Nexml_ID
} = $caller->{_ID
}?
$caller->{_ID
} . $taxa->get_xml_id : $taxa->get_xml_id;
163 my $aln_feats = Bio
::SeqFeature
::Generic
->new();
164 $aln_feats->add_tag_value('NexmlIO_ID', $caller->{_ID
});
165 #check if there is a taxa associated with this alignment
167 $aln_feats->add_tag_value('taxa_id', $taxa->get_xml_id());
168 $aln_feats->add_tag_value('taxa_label', $taxa->get_name()) if $taxa->get_name();
170 my $taxon = $taxa->first;
172 $aln_feats->add_tag_value('taxon', $taxon->get_name);
173 $taxon = $taxa->next;
176 $aln->add_SeqFeature($aln_feats);
178 my $basename = $matrix->get_name();
181 my$row = $matrix->first;
184 my $newSeq = $row->get_char();
188 #constuct seqID based on matrix label and row id
189 my $seqID = "$basename.row_$seqNum";
191 #Check if theres a row label and if not default to seqID
192 if( !defined($rowlabel = $row->get_name())) {$rowlabel = $seqID;}
194 $seq = Bio
::LocatableSeq
->new(
196 -display_id
=> "$rowlabel",
197 #-description => $desc,
198 -alphabet
=> $mol_type,
201 #check if there is a taxa associated w/ this alignment
204 if (my $taxon = $taxa->get_by_name($row->get_taxon->get_name())) {
205 #attach taxon to each sequence by using the sequenceID because
206 #LocatableSeq does not support features
207 my $taxon_name = $taxon->get_name();
208 $seq_feats = Bio
::SeqFeature
::Generic
->new();
209 $seq_feats->add_tag_value('taxon', "$taxon_name");
210 $seq_feats->add_tag_value('id', "$rowlabel");
214 $aln->add_SeqFeature($seq_feats);
215 $self->debug("Reading r$rowlabel\n");
217 $row = $matrix->next();
225 =head2 create_bperl_tree
227 Title : create_bperl_tree
228 Usage : my @trees = $factory->create_bperl_seq($objIO);
229 Function: Converts Bio::Phylo::Forest::Tree objects into L<Bio::Tree::Tree> objects
230 Returns : an array of L<Bio::Tree::Tree> objects
231 Args : Bio::NexmlIO, Bio::SeqIO, Bio::AlignIO, or Bio::TreeIO
233 see [http://search.cpan.org/~rvosa/Bio-Phylo/lib/Bio/Phylo/Project.pm Bio::Phylo::Project]
237 sub create_bperl_tree
{
238 my($self, $caller) = @_;
241 my $forests = $caller->doc->get_forests();
243 foreach my $forest (@
$forests)
245 my $basename = $forest->get_name() || '';
246 my $taxa = $forest->get_taxa();
247 my $taxa_label = $taxa->get_name();
248 my $taxa_id = $taxa->get_xml_id();
250 my $t = $forest->first();
255 my $tree_id = $t->get_name();
256 my $tree = Bio
::Tree
::Tree
->new(-id
=> "$basename.$tree_id");
258 #set the taxa info of the tree
259 $tree->add_tag_value('taxa_label', $taxa_label) if defined($taxa_label);
260 $tree->add_tag_value('taxa_id', $taxa_id) if defined($taxa_id);
262 # TODO: should $caller->{_ID} always be defined?
263 # ATM, this is a Bio::TreeIO::nexml stream...
264 $tree->add_tag_value('_NexmlIO_ID', $caller->{_ID
}) if $caller->{_ID
};
266 my $taxon = $taxa->first;
268 $tree->add_tag_value('taxon', $taxon->get_name()) if defined($taxon->get_name);
269 $taxon = $taxa->next;
272 #process terminals only, removing terminals as they get processed
273 #which inturn creates new terminals to process until the entire tree has been processed
274 my $terminals = $t->get_terminals();
275 # for(my $i=0; $i<@$terminals; $i++)
276 while (my $terminal = shift @
$terminals)
278 # my $terminal = $$terminals[$i];
279 my $new_node_id = $terminal->get_name();
282 if(exists $created_nodes{$new_node_id})
284 $newNode = $created_nodes{$new_node_id};
288 $newNode = Bio
::Tree
::Node
->new();
289 $new_node_id ||= 'internal_'.$newNode->_creation_id;
290 $newNode->id($new_node_id);
292 $created_nodes{$new_node_id} = $newNode;
295 #check if taxa data exists for the current node ($terminal)
297 my $taxon = $terminal->get_taxon();
298 $newNode->add_tag_value("taxon", $taxon->get_name()) if $taxon && $taxon->get_name;
301 #check if you've reached the root of the tree and if so, stop.
302 if($terminal->is_root()) {
303 $tree->set_root_node($newNode);
307 #transfer attributes that apply to non-root only nodes
308 $newNode->branch_length($terminal->get_branch_length());
310 my $parent = $terminal->get_parent();
311 my $parentID = $parent->get_name();
312 if(exists $created_nodes{$parentID})
315 $created_nodes{$parentID}->add_Descendent($newNode);
319 my $parent_node = Bio
::Tree
::Node
->new();
320 $parentID ||= 'internal_'.$parent_node->_creation_id;
321 $parent_node->id($parentID);
322 $parent_node->add_Descendent($newNode);
323 $created_nodes{$parentID} = $parent_node;
325 #remove processed node from tree
326 $parent->prune_child($terminal);
328 #check if the parent of the removed node is now a terminal node and should be added for processing
329 if($parent->is_terminal())
331 push(@
$terminals, $terminal->get_parent()) if $terminal->get_parent;
335 $t = $forest->next();
341 =head2 create_bperl_seq
343 Title : create_bperl_seq
344 Usage : my @seqs = $factory->create_bperl_seq($objIO);
345 Function: Converts Bio::Phylo::Matrices::Datum objects into L<Bio::Seq> objects
346 Returns : an array of L<Bio::Seq> objects
347 Args : Bio::NexmlIO, Bio::SeqIO, Bio::AlignIO, or Bio::TreeIO
349 see [http://search.cpan.org/~rvosa/Bio-Phylo/lib/Bio/Phylo/Project.pm Bio::Phylo::Project]
353 sub create_bperl_seq
{
354 my($self, $caller) = @_;
355 my $matrices = $caller->doc->get_matrices();
358 foreach my $matrix (@
$matrices)
360 #check if mol_type is something that makes sense to be a seq
361 my $mol_type = lc($matrix->get_type());
362 unless ($mol_type eq 'dna' || $mol_type eq 'rna' || $mol_type eq 'protein')
367 my $taxa = $matrix->get_taxa();
369 my $taxa_id = $taxa->get_xml_id();
370 my $taxa_label = $taxa->get_name();
371 my $basename = $matrix->get_name();
372 my $row = $matrix->first;
375 my $newSeq = $row->get_char();
376 my $feat = Bio
::SeqFeature
::Generic
->new();
377 $feat->add_tag_value('matrix_label', $matrix->get_name()) if defined($matrix->get_name);
378 $feat->add_tag_value('matrix_id', $matrix->get_xml_id());
379 $feat->add_tag_value('NexmlIO_ID', $caller->{_ID
});
380 $feat->add_tag_value('taxa_id', $taxa_id) if defined($taxa_id);
381 $feat->add_tag_value('taxa_label', $taxa_label) if defined($taxa_label);
384 #construct full sequence id by using bio::phylo "matrix label" and "row id"
385 my $seqID = "$basename.seq_$seqnum";
387 #check if there is a label for the row, if not default to seqID
388 if (!defined ($rowlabel = $row->get_name())) {$rowlabel = $seqID;}
389 else {$seqID = $rowlabel;}
391 #build the seq object using the factory create method
392 my $seqbuilder = Bio
::Seq
::SeqFactory
->new('-type' => 'Bio::Seq');
393 my $seq = $seqbuilder->create(
396 -primary_id
=> $seqID,
398 -alphabet
=> $mol_type,
401 # TODO: should $caller->{_ID} always be defined?
402 # ATM, this is a Bio::SeqIO::nexml stream...
403 $seq->{_Nexml_ID
} = $caller->{_ID
} ?
$caller->{_ID
} . $taxa_id : $taxa_id;
404 $seq->{_Nexml_matrix_ID
} = $caller->{_ID
} ?
$caller->{_ID
} . $matrix->get_xml_id() : $matrix->get_xml_id();
406 #check if taxon linked to sequence if so create feature to attach to alignment
408 my $taxon = $taxa->first;
410 $feat->add_tag_value('taxon', $taxon->get_name) if defined($taxon->get_name);
411 if($taxon eq $row->get_taxon) {
412 my $taxon_name = $taxon->get_name();
414 $feat->add_tag_value('my_taxon', "$taxon_name") if defined($taxon_name);
415 $feat->add_tag_value('id', $rowlabel);
417 $taxon = $taxa->next;
420 $seq->add_SeqFeature($feat);
423 $row = $matrix->next;
429 =head2 create_bphylo_tree
431 Title : create_bphylo_tree
432 Usage : my $bphylo_tree = $factory->create_bphylo_tree($bperl_tree);
433 Function: Converts a L<Bio::Tree::Tree> object into Bio::Phylo::Forest::Tree object
434 Returns : a Bio::Phylo::Forest::Tree object
435 Args : Bio::Tree::Tree object
439 sub create_bphylo_tree
{
440 my ($self, $bptree, $taxa) = @_;
441 #most of the code below ripped form Bio::Phylo::Forest::Tree::new_from_bioperl()d
443 my $tree = $fac->create_tree;
444 my $class = 'Bio::Phylo::Forest::Tree';
446 if ( ref $bptree && $bptree->isa('Bio::Tree::TreeI') ) {
448 ($tree) = _copy_tree
( $tree, $bptree->get_root_node, "", $taxa);
451 my $name = $bptree->id;
452 $tree->set_name( $name ) if defined $name;
455 my $score = $bptree->score;
456 $tree->set_score( $score ) if defined $score;
459 $self->throw('Not a bioperl tree!');
466 my ( $tree, $bpnode, $parent, $taxa ) = @_;
467 my $node = create_bphylo_node
($bpnode);
470 $parent->set_child($node);
472 if (my $bptaxon_name = $bpnode->get_tag_values('taxon'))
474 $node->set_taxon($taxa->get_by_name($bptaxon_name));
476 $tree->insert($node);
477 foreach my $bpchild ( $bpnode->each_Descendent ) {
478 _copy_tree
( $tree, $bpchild, $node, $taxa );
483 =head2 create_bphylo_node
485 Title : create_bphylo_node
486 Usage : my $bphylo_node = $factory->create_bphylo_node($bperl_node);
487 Function: Converts a L<Bio::Tree::Node> object into Bio::Phylo::Forest::Node object
488 Returns : a Bio::Phylo::Forest::Node object
489 Args : L<Bio::Tree::Node> object
493 sub create_bphylo_node
{
495 my $node = Bio
::Phylo
::Forest
::Node
->new();
497 #mostly ripped from Bio::Phylo::Forest::Node->new_from_bioperl()
499 my $name = $bpnode->id;
500 $node->set_name( $name ) if defined $name;
503 my $branch_length = $bpnode->branch_length;
504 $node->set_branch_length( $branch_length ) if defined $branch_length;
507 my $desc = $bpnode->description;
508 $node->set_desc( $desc ) if defined $desc;
511 my $bootstrap = $bpnode->bootstrap;
512 $node->set_score( $bootstrap ) if defined $bootstrap and looks_like_number
$bootstrap;
515 for my $tag ( $bpnode->get_all_tags ) {
516 my @values = $bpnode->get_tag_values( $tag );
517 $node->set_generic( $tag => \
@values );
523 =head2 create_bphylo_aln
525 Title : create_bphylo_aln
526 Usage : my $bphylo_aln = $factory->create_bphylo_aln($bperl_aln);
527 Function: Converts a L<Bio::SimpleAlign> object into Bio::Phylo::Matrices::Matrix object
528 Returns : a Bio::Phylo::Matrices::Matrix object
529 Args : Bio::SimpleAlign object
533 sub create_bphylo_aln
{
535 my ($self, $aln, $taxa, @args) = @_;
537 #most of the code below ripped from Bio::Phylo::Matrices::Matrix::new_from_bioperl()
538 if ( $aln->isa('Bio::Align::AlignI') ) {
540 $aln->map_chars('\.','-');
541 my @seqs = $aln->each_seq;
542 my ( $type, $missing, $gap, $matchchar );
544 $type = $seqs[0]->alphabet || $seqs[0]->_guess_alphabet || 'dna';
550 my $matrix = $fac->create_matrix(
552 '-special_symbols' => {
553 '-missing' => $aln->missing_char || '?',
554 '-matchchar' => $aln->match_char || '.',
555 '-gap' => $aln->gap_char || '-',
559 # XXX create raw getter/setter pairs for annotation, accession, consensus_meta source
560 for my $field ( qw(description accession id annotation consensus_meta score source) ) {
561 $matrix->$field( $aln->$field );
563 my $to = $matrix->get_type_object;
564 my @feats = $aln->get_all_SeqFeatures();
566 for my $seq ( @seqs ) {
567 #create datum linked to taxa
568 my $datum = create_bphylo_datum
($seq, $taxa, \
@feats, '-type_object' => $to);
569 $matrix->insert($datum);
574 $self->throw('Not a bioperl alignment!');
580 =head2 create_bphylo_seq
582 Title : create_bphylo_seq
583 Usage : my $bphylo_seq = $factory->create_bphylo_seq($bperl_seq);
584 Function: Converts a L<Bio::Seq> object into Bio::Phylo::Matrices::Matrix object
585 Returns : a Bio::Phylo::Matrices::Matrix object
586 Args : Bio::Seq object
590 sub create_bphylo_seq
{
591 my ($self, $seq, $taxa, @args) = @_;
592 my $type = $seq->alphabet || $seq->_guess_alphabet || 'dna';
595 my $dat = create_bphylo_datum
($seq, $taxa, '-type' => $type);
598 my $seqstring = $seq->seq;
599 if ( $seqstring and $seqstring =~ /\S/ ) {
600 eval { $dat->set_char( $seqstring ) };
601 if ( $@
and UNIVERSAL
::isa
($@
,'Bio::Phylo::Util::Exceptions::InvalidData') ) {
602 $self->throw("\n\nThe BioPerl sequence object contains invalid data ($seqstring)\n");
607 my $name = $seq->display_id;
608 #$dat->set_name( $name ) if defined $name;
611 my $desc = $seq->desc;
612 $dat->set_desc( $desc ) if defined $desc;
614 #get features from SeqFeatureI
615 for my $field ( qw(start end strand) ) {
616 $dat->$field( $seq->$field ) if $seq->can($field);
621 =head2 create_bphylo_taxa
623 Title : create_bphylo_seq
624 Usage : my $taxa = $factory->create_bphylo_taxa($bperl_obj);
625 Function: creates a taxa object from the data attached to a bioperl object
626 Returns : a Bio::Phylo::Taxa object
627 Args : L<Bio::Seq> object, or L<Bio::SimpleAlign> object, or L<Bio::Tree::Tree> object
631 sub create_bphylo_taxa
{
635 #check if tree or aln object
636 if ( UNIVERSAL
::isa
( $obj, 'Bio::Align::AlignI' ) || UNIVERSAL
::isa
( $obj, 'Bio::Seq')) {
637 return $self->_create_bphylo_matrix_taxa(@_);
639 elsif ( UNIVERSAL
::isa
( $obj, 'Bio::Tree::TreeI' ) ) {
640 return $self->_create_bphylo_tree_taxa(@_);
644 sub _create_bphylo_tree_taxa
{
645 my ($self, $tree) = @_;
647 my $taxa = $fac->create_taxa();
650 #check if taxa exists
651 unless ($tree->has_tag('taxa_id')) {
656 $taxa->set_xml_id(($tree->get_tag_values('taxa_id'))[0]);
657 $taxa->set_name(($tree->get_tag_values('taxa_label'))[0]);
659 foreach my $taxon_name ($tree->get_tag_values('taxon')) {
661 $taxon = $fac->create_taxon(-name
=> $taxon_name);
662 $taxa->insert($taxon);
667 sub _create_bphylo_matrix_taxa
{
668 my ($self, $aln) = @_;
670 my $taxa = $fac->create_taxa();
672 my @feats = $aln->get_all_SeqFeatures();
674 foreach my $feat (@feats) {
675 if (my $taxa_id = ($feat->get_tag_values('taxa_id'))[0]) {
676 my $taxa_label = ($feat->get_tag_values('taxa_label'))[0];
678 $taxa->set_name($taxa_label) if defined $taxa_label;
679 $taxa->set_xml_id($taxa_id) if defined $taxa_label;
680 my @taxa_bp = $feat->get_tag_values('taxon');
681 foreach my $taxon_name (@taxa_bp) {
682 $taxon = $fac->create_taxon(-name
=> $taxon_name);
683 $taxa->insert($taxon);
691 =head2 create_bphylo_datum
693 Title : create_bphylo_datum
694 Usage : my $bphylo_datum = $factory->create_bphylo_datum($bperl_datum);
695 Function: Converts a L<Bio::Seq> object into Bio::Phylo::Matrices::datum object
696 Returns : a Bio::Phylo::Matrices::datum object
697 Args : Bio::Seq object, Bio::Phylo::Taxa object,
698 [optional] arrayref to SeqFeatures,
699 [optional] key => value pairs to pass to Bio::Phylo constructor
703 sub create_bphylo_datum
{
704 #mostly ripped from Bio::Phylo::Matrices::Datum::new_from_bioperl()
705 my ( $seq, $taxa, @args ) = @_;
706 my $class = 'Bio::Phylo::Matrices::Datum';
708 # want $seq type-check here? Allowable: is-a Bio::PrimarySeq,
709 # Bio::LocatableSeq /maj
710 if (@args % 2) { # odd
711 $feats = shift @args;
712 unless (ref($feats) eq 'ARRAY') {
713 Bio
::Root
::Root
->throw("Third argument must be array of SeqFeatures");
716 my $type = $seq->alphabet || $seq->_guess_alphabet || 'dna';
717 my $self = $class->new( '-type' => $type, @args );
719 my $seqstring = $seq->seq;
720 if ( $seqstring and $seqstring =~ /\S/ ) {
721 eval { $self->set_char( $seqstring ) };
722 if ( $@
and UNIVERSAL
::isa
($@
,'Bio::Phylo::Util::Exceptions::InvalidData') ) {
723 $self->throw("\n\nThe BioPerl sequence object contains invalid data ($seqstring)\n");
728 my $name = $seq->display_id;
729 $self->set_name( $name ) if defined $name;
731 my @feats = (defined $feats ? @
$feats : $seq->get_all_SeqFeatures);
733 foreach my $feat (@feats)
735 #get sequence id associated with taxa to compare
736 my $taxa_id = ($feat->get_tag_values('id'))[0] if $feat->has_tag('id');
737 if ($taxa_id && $name eq $taxa_id)
740 if($feat->has_tag('my_taxon')) {
741 $taxon_name = ($feat->get_tag_values('my_taxon'))[0]
744 $taxon_name = ($feat->get_tag_values('taxon'))[0];
746 $self->set_taxon($taxa->get_by_name($taxon_name));
751 my $desc = $seq->desc;
752 $self->set_desc( $desc ) if defined $desc;
754 # only Bio::LocatableSeq objs have these fields...
755 for my $field ( qw(start end strand) ) {
756 $self->$field( $seq->$field ) if $seq->can($field);
765 =head1 bioperl_create
767 Title : bioperl_create
768 Usage : $bioperl_obj = $fac->bioperl_create($obj_type, $biophylo_proj);
769 Function: Create a specified bioperl object using a Bio::Phylo project
770 Args : scalar string ('aln', 'tree', 'seq') type designator
771 Bio::Phylo::Project object
772 Returns : Appropriate BioPerl object
778 my ($type, @args) = @_;
779 unless (grep /^type/,qw( seq aln tree )) {
780 $self->throw("Unrecognized type for argument 1");
782 my $call = 'create_bioperl_'.$type;
783 return $self->$call(@args);