3 # BioPerl module for Bio::OntologyIO::dagflat
5 # Cared for by Hilmar Lapp, hlapp at gmx.net
7 # (c) Christian M. Zmasek, czmasek@gnf.org, 2002.
8 # (c) Hilmar Lapp, hlapp at gmx.net, 2003.
9 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
11 # You may distribute this module under the same terms as perl itself.
12 # Refer to the Perl Artistic License (see the license accompanying this
13 # software package, or see http://www.perl.com/language/misc/Artistic.html)
14 # for the terms under which you may use, modify, and redistribute this module.
16 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
17 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
18 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
20 # You may distribute this module under the same terms as perl itself
22 # POD documentation - main docs before the code
26 Bio::OntologyIO::dagflat - a base class parser for GO flat-file type formats
32 # do not use directly -- use via Bio::OntologyIO
33 # e.g., the GO parser is a simple extension of this class
34 my $parser = Bio::OntologyIO->new
36 -defs_file => "/home/czmasek/GO/GO.defs",
37 -files => ["/home/czmasek/GO/component.ontology",
38 "/home/czmasek/GO/function.ontology",
39 "/home/czmasek/GO/process.ontology"] );
41 my $go_ontology = $parser->next_ontology();
43 my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" );
44 my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" );
45 my $RELATED_TO = Bio::Ontology::RelationshipType->get_instance( "RELATED_TO" );
49 Needs Graph.pm from CPAN.
55 User feedback is an integral part of the evolution of this and other
56 Bioperl modules. Send your comments and suggestions preferably to the
57 Bioperl mailing lists Your participation is much appreciated.
59 bioperl-l@bioperl.org - General discussion
60 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via the
68 http://bugzilla.open-bio.org/
74 Email: czmasek@gnf.org or cmzmasek@yahoo.com
76 WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/
80 Genomics Institute of the Novartis Research Foundation
81 10675 John Jay Hopkins Drive
86 Hilmar Lapp, hlapp at gmx.net
90 The rest of the documentation details each of the object
91 methods. Internal methods are usually preceded with a _
96 # Let the code begin...
99 package Bio
::OntologyIO
::dagflat
;
104 use Bio
::Ontology
::OBOEngine
;
105 use Bio
::Ontology
::Ontology
;
106 use Bio
::Ontology
::OntologyStore
;
107 use Bio
::Ontology
::TermFactory
;
108 use Bio
::Annotation
::DBLink
;
110 use constant TRUE
=> 1;
111 use constant FALSE
=> 0;
114 use base
qw(Bio::OntologyIO);
120 Usage : $parser = Bio::OntologyIO->new(
122 -defs_file => "/path/to/GO.defs",
123 -files => ["/path/to/component.ontology",
124 "/path/to/function.ontology",
125 "/path/to/process.ontology"] );
126 Function: Creates a new dagflat parser.
127 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
128 Args : -defs_file => the name of the file holding the term
130 -files => a single ontology flat file holding the
131 term relationships, or an array ref holding
132 the file names (for GO, there will usually be
133 3 files: component.ontology, function.ontology,
135 -file => if there is only a single flat file, it may
136 also be specified via the -file parameter
137 -ontology_name => the name of the ontology; if not specified the
138 parser will auto-discover it by using the term
139 that starts with a $, and converting underscores
141 -engine => the Bio::Ontology::OntologyEngineI object
142 to be reused (will be created otherwise); note
143 that every Bio::Ontology::OntologyI will
144 qualify as well since that one inherits from the
147 See L<Bio::OntologyIO>.
151 # in reality, we let OntologyIO::new do the instantiation, and override
152 # _initialize for all initialization work
154 my ($self, %arg) = @_;
156 my ( $defs_file_name,$files,$defs_url,$url,$name,$eng ) =
157 $self->_rearrange([qw( DEFS_FILE
166 delete($arg{-url
}); #b/c GO has 3 files...
168 $self->SUPER::_initialize
( %arg );
170 $self->_done( FALSE
);
171 $self->_not_first_record( FALSE
);
173 delete $self->{'_ontologies'};
175 # ontology engine (and possibly name if it's an OntologyI)
176 $eng = Bio
::Ontology
::OBOEngine
->new() unless $eng;
177 if($eng->isa("Bio::Ontology::OntologyI")) {
178 $self->ontology_name($eng->name());
179 $eng = $eng->engine() if $eng->can('engine');
181 $self->_ont_engine($eng);
183 # flat files to parse
184 if(defined($defs_file_name) && defined($defs_url)){
185 $self->throw('cannot provide both -defs_file and -defs_url');
187 defined($defs_file_name) && $self->defs_file( $defs_file_name );
188 defined($defs_url) && $self->defs_url( $defs_url );
191 if(defined($files) && defined($url)){
192 } elsif(defined($files)){
193 $self->{_flat_files
} = $files ?
ref($files) ?
$files : [$files] : [];
194 } elsif(defined($url)){
198 # ontology name (overrides implicit one through OntologyI engine)
199 $self->ontology_name($name) if $name;
205 Title : ontology_name
206 Usage : $obj->ontology_name($newval)
207 Function: Get/set the name of the ontology parsed by this module.
209 Returns : value of ontology_name (a scalar)
210 Args : on set, new value (a scalar or undef, optional)
218 return $self->{'ontology_name'} = shift if @_;
219 return $self->{'ontology_name'};
226 Usage : $parser->parse();
227 Function: Parses the files set with "new" or with methods
228 defs_file and _flat_files.
230 Normally you should not need to call this method as it will
231 be called automatically upon the first call to
234 Returns : [Bio::Ontology::OntologyEngineI]
243 # setup the default term factory if not done by anyone yet
244 $self->term_factory(Bio
::Ontology
::TermFactory
->new(
245 -type
=> "Bio::Ontology::Term"))
246 unless $self->term_factory();
248 # create the ontology object itself
249 my $ont = Bio
::Ontology
::Ontology
->new(-name
=> $self->ontology_name(),
250 -engine
=> $self->_ont_engine());
253 while( my $term = $self->_next_term() ) {
254 $self->_add_term( $term, $ont );
257 # set up the ontology of the relationship types
258 foreach ($self->_part_of_relationship(), $self->_is_a_relationship(), $self->_related_to_relationship()) {
262 # pre-seed the IO system with the first flat file if -file wasn't provided
265 if(ref($self->url) eq 'ARRAY'){
267 foreach my $url (@
{ $self->url }){
270 #warn scalar($ont->get_all_terms());
271 $self->_initialize_io(-url
=> $url);
272 $self->_parse_flat_file($ont);
276 $self->_initialize_io(-url
=> $self->url);
278 } elsif($self->_flat_files){
279 $self->_initialize_io(-file
=> shift(@
{$self->_flat_files()}));
284 $self->_parse_flat_file($ont);
285 # advance to next flat file if more are available
286 if(@
{$self->_flat_files()}) {
288 $self->_initialize_io(-file
=> shift(@
{$self->_flat_files()}));
290 last; # nothing else to parse so terminate the loop
293 $self->_add_ontology($ont);
295 # not needed anywhere, only because of backward compatibility
296 return $self->_ont_engine();
301 Title : next_ontology
303 Function: Get the next available ontology from the parser. This is the
304 method prescribed by Bio::OntologyIO.
306 Returns : An object implementing Bio::Ontology::OntologyI, and undef if
307 there is no more ontology in the input.
316 # parse if not done already
317 $self->parse() unless exists($self->{'_ontologies'});
318 # return next available ontology
319 if(exists($self->{'_ontologies'})){
320 my $ont = shift (@
{$self->{'_ontologies'}});
322 my $store = Bio
::Ontology
::OntologyStore
->new();
323 $store->register_ontology($ont);
333 Usage : $parser->defs_file( "GO.defs" );
334 Function: Set/get for the term definitions filename.
335 Returns : The term definitions file name [string].
336 Args : On set, the term definitions file name [string] (optional).
345 $self->{ "_defs_file_name" } = $f;
346 $self->_defs_io->close() if $self->_defs_io();
348 $self->_defs_io( Bio
::Root
::IO
->new( -input
=> $f ) );
351 return $self->{ "_defs_file_name" };
358 $self->{'_defs_url'} = $val;
360 $self->_defs_io->close() if $self->_defs_io();
361 $self->_defs_io( Bio
::Root
::IO
->new( -url
=> $val ) );
363 return $self->{'_defs_url'};
370 $self->{'_url'} = $val;
372 return $self->{'_url'};
379 Function: Closes this ontology stream and associated file handles.
381 Clients should call this method especially when they write
384 We need to override this here in order to close the file
385 handle for the term definitions file.
397 # first call the inherited implementation
398 $self->SUPER::close();
399 # then close the defs file io (if there is one)
400 $self->_defs_io->close() if $self->_defs_io();
406 Usage : $files_to_parse = $parser->_flat_files();
407 Function: Get the array of ontology flat files that need to be parsed.
409 Note that this array will decrease in elements over the
410 parsing process. Therefore, it\'s value outside of this
411 module will be limited. Also, be careful not to alter the
412 array unless you know what you are doing.
414 Returns : a reference to an array of zero or more strings
422 $self->{_flat_files
} = [] unless exists($self->{_flat_files
});
423 return $self->{_flat_files
};
433 Usage : $obj->_defs_io($newval)
434 Function: Get/set the Bio::Root::IO instance representing the
435 definition file, if provided (see defs_file()).
437 Returns : value of _defs_io (a Bio::Root::IO object)
438 Args : on set, new value (a Bio::Root::IO object or undef, optional)
445 return $self->{'_defs_io'} = shift if @_;
446 return $self->{'_defs_io'};
451 $self->{'_ontologies'} = [] unless exists($self->{'_ontologies'});
452 foreach my $ont (@_) {
453 $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI")
454 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
455 # the ontology name may have been auto-discovered while parsing
457 $ont->name($self->ontology_name) unless $ont->name();
458 push(@
{$self->{'_ontologies'}}, $ont);
462 # This simply delegates. See SimpleGOEngine.
464 my ( $self, $term, $ont ) = @_;
465 $term->ontology($ont) if $ont && (! $term->ontology);
466 $self->_ont_engine()->add_term( $term );
471 # This simply delegates. See SimpleGOEngine
472 sub _part_of_relationship
{
475 return $self->_ont_engine()->part_of_relationship(@_);
476 } # _part_of_relationship
480 # This simply delegates. See SimpleGOEngine
481 sub _is_a_relationship
{
484 return $self->_ont_engine()->is_a_relationship(@_);
485 } # _is_a_relationship
487 # This simply delegates. See SimpleGOEngine
488 sub _related_to_relationship
{
491 return $self->_ont_engine()->related_to_relationship(@_);
492 } # _is_a_relationship
496 # This simply delegates. See SimpleGOEngine
497 sub _add_relationship
{
498 my ( $self, $parent, $child, $type, $ont ) = @_;
500 # note the triple terminology (subject,predicate,object) corresponds to
501 # (child,type,parent)
502 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
505 } # _add_relationship
508 # This simply delegates. See SimpleGOEngine
512 return $self->_ont_engine()->has_term( @_ );
517 # This parses the relationships files
518 sub _parse_flat_file
{
523 my $prev_spaces = -1;
526 while ( my $line = $self->_readline() ) {
528 if ( $line =~ /^!/ ) {
532 # split into term specifications
533 my @termspecs = split(/ (?=[%<])/, $line);
534 # the first element is whitespace only
535 shift(@termspecs) if $termspecs[0] =~ /^\s*$/;
537 # parse out the focus term
538 my $current_term = $self->_get_first_termid( $termspecs[0] );
539 my @syns = $self->_get_synonyms( $termspecs[0] );
540 my @sec_go_ids = $self->_get_secondary_termids( $termspecs[0] );
541 my @cross = $self->_get_db_cross_refs( $termspecs[0] );
543 foreach my $cross_ref (@cross) {
544 $cross_ref eq $current_term && next;
545 push(@cross_refs, $cross_ref);
548 # parse out the parents of the focus term
550 my @isa_parents = ();
551 my @partof_parents = ();
552 foreach my $parent (@termspecs) {
553 if (index($parent, "%") == 0) {
554 push(@isa_parents, $self->_get_first_termid($parent));
555 } elsif (index($parent, "<") == 0) {
556 push(@partof_parents, $self->_get_first_termid($parent));
558 $self->warn("unhandled relationship type in '".$parent."'");
562 if ( ! $self->_has_term( $current_term ) ) {
563 my $term =$self->_create_ont_entry($self->_get_name($line,
566 $self->_add_term( $term, $ont );
569 my $current_term_object = $self->_ont_engine()->get_terms( $current_term );
570 my $anno = $self->_to_annotation(\
@cross_refs);
571 $current_term_object->add_dbxref(-dbxrefs
=> $anno);
572 $current_term_object->add_secondary_id( @sec_go_ids );
573 $current_term_object->add_synonym( @syns );
574 unless ( $line =~ /^\$/ ) {
575 $current_term_object->ontology( $ont );
577 foreach my $parent ( @isa_parents ) {
578 if ( ! $self->_has_term( $parent ) ) {
579 my $term = $self->_create_ont_entry($self->_get_name($line,
582 $self->_add_term( $term, $ont );
585 $self->_add_relationship( $parent,
587 $self->_is_a_relationship(),
591 foreach my $parent ( @partof_parents ) {
592 if ( ! $self->_has_term( $parent ) ) {
593 my $term = $self->_create_ont_entry($self->_get_name($line,
596 $self->_add_term( $term, $ont );
599 $self->_add_relationship( $parent,
601 $self->_part_of_relationship(),
605 my $current_spaces = $self->_count_spaces( $line );
607 if ( $current_spaces != $prev_spaces ) {
609 if ( $current_spaces == $prev_spaces + 1 ) {
610 push( @stack, $prev_term );
611 } elsif ( $current_spaces < $prev_spaces ) {
612 my $n = $prev_spaces - $current_spaces;
613 for ( my $i = 0; $i < $n; ++$i ) {
617 $self->throw( "format error (file ".$self->file.")" );
621 my $parent = $stack[ @stack - 1 ];
623 # add a relationship if the line isn\'t the one with the root term
624 # of the ontology (which is also the name of the ontology)
625 if ( index($line,'$') != 0 ) {
626 #adding @reltype@ syntax
627 if ( $line !~ /^\s*([<%~]|\@\w+?\@)/ ) {
628 $self->throw( "format error (file ".$self->file.") offending line:\n$line" );
631 my($relstring) = $line =~ /^\s*([<%~]|\@[^\@]+?\@)/;
635 if ($relstring eq '<') {
636 $reltype = $self->_part_of_relationship;
637 } elsif ($relstring eq '%') {
638 $reltype = $self->_is_a_relationship;
639 } elsif ($relstring eq '~') {
640 $reltype = $self->_related_to_relationship;
642 $relstring =~ s/\@//g;
643 if ($self->_ont_engine->get_relationship_type($relstring)) {
644 $reltype = $self->_ont_engine->get_relationship_type($relstring);
646 $self->_ont_engine->add_relationship_type($relstring, $ont);
647 $reltype = $self->_ont_engine->get_relationship_type($relstring);
651 #my $reltype = ($line =~ /^\s*</) ?
652 #$self->_part_of_relationship() :
653 #$self->_is_a_relationship();
654 $self->_add_relationship( $parent, $current_term, $reltype, $ont);
657 $prev_spaces = $current_spaces;
658 $prev_term = $current_term;
661 } # _parse_relationships_file
665 # Parses the 1st term id number out of line.
666 sub _get_first_termid
{
667 my ( $self, $line ) = @_;
668 if ( $line =~ /;\s*([A-Z_]{1,8}:\d{1,})/ ) {
669 # if ( $line =~ /;\s*(\w+:\w+)/ ) {
673 $self->throw( "format error: no term id in line \"$line\"" );
676 } # _get_first_termid
680 # Parses the name out of line.
682 my ( $self, $line, $termid ) = @_;
684 if ( $line =~ /([^;<%~]+);\s*$termid/ ) {
686 # remove trailing and leading whitespace
689 $name =~ s/\@.+?\@//;
690 # remove leading dollar character; also we default the name of the
691 # ontology to this name unless it is preset to something else
692 if(index($name,'$') == 0) {
693 $name = substr($name,1);
694 # replace underscores by spaces for setting the ontology name
695 $self->ontology_name(join(" ",split(/_/,$name)))
696 unless $self->ontology_name();
706 # Parses the synonyms out of line.
708 my ( $self, $line ) = @_;
712 while ( $line =~ /synonym\s*:\s*([^;<%~]+)/g ) {
716 push( @synonyms, $syn );
724 # Parses the db cross refs out of line.
725 sub _get_db_cross_refs
{
726 my ( $self, $line ) = @_;
730 while ( $line =~ /;([^;<%~:]+:[^;<%~:]+)/g ) {
732 if ( $ref =~ /synonym/ || $ref =~ /[A-Z]{1,8}:\d{3,}/ ) {
738 $ref = $self->unescape( $ref );
740 push( @refs, $ref ) if defined $ref;
747 # Parses the secondary go ids out of a line
748 sub _get_secondary_termids
{
749 my ( $self, $line ) = @_;
752 # while ( $line =~ /,\s*([A-Z]{1,8}:\d{3,})/g ) {
753 while ( $line =~ /,\s*(\w+:\w+)/g ) {
759 } # _get_secondary_termids
762 # Counts the spaces at the beginning of a line in the relationships files
764 my ( $self, $line ) = @_;
766 if ( $line =~ /^(\s+)/ ) {
775 # "next" method for parsing the defintions file
779 if ( ($self->_done() == TRUE
) || (! $self->_defs_io())) {
785 my $next_term = $self->_term();
791 while( $line = ( $self->_defs_io->_readline() ) ) {
793 || $line =~ /^\s*!/ ) {
796 elsif ( $line =~ /^\s*term:\s*(.+)/ ) {
798 last if $self->_not_first_record();
800 $self->_not_first_record( TRUE
);
802 elsif ( $line =~ /^\s*[a-z]{0,8}id:\s*(.+)/ ) {
805 elsif ( $line =~ /^\s*definition:\s*(.+)/ ) {
806 $def = $self->unescape($1);
807 $isobsolete = 1 if index($def,"OBSOLETE") == 0;
809 elsif ( $line =~ /^\s*definition_reference:\s*(.+)/ ) {
810 push( @def_refs, $self->unescape($1) );
812 elsif ( $line =~ /^\s*comment:\s*(.+)/ ) {
813 $comment = $self->unescape($1);
816 $self->_done( TRUE
) unless $line; # we'll come back until done
817 return $self->_create_ont_entry( $next_term, $termid, $def,
818 $comment, \
@def_refs, $isobsolete);
823 # Holds the GO engine to be parsed into
825 my ( $self, $value ) = @_;
827 if ( defined $value ) {
828 $self->{ "_ont_engine" } = $value;
831 return $self->{ "_ont_engine" };
835 # Used to create ontology terms.
836 # Arguments: name, id
837 sub _create_ont_entry
{
838 my ( $self, $name, $termid, $def, $cmt, $dbxrefs, $obsolete ) = @_;
840 if((!defined($obsolete)) && (index(lc($name),"obsolete") == 0)) {
843 my $anno = $self->_to_annotation($dbxrefs);
844 my $term = $self->term_factory->create_object(-name
=> $name,
845 -identifier
=> $termid,
849 -is_obsolete
=> $obsolete);
852 } # _create_ont_entry
856 # Holds whether first record or not
857 sub _not_first_record
{
858 my ( $self, $value ) = @_;
860 if ( defined $value ) {
861 $self->{ "_not_first_record" } = $value;
864 return $self->{ "_not_first_record" };
865 } # _not_first_record
869 # Holds whether done or not
871 my ( $self, $value ) = @_;
873 if ( defined $value ) {
874 $self->{ "_done" } = $value;
877 return $self->{ "_done" };
883 my ( $self, $value ) = @_;
885 if ( defined $value ) {
886 $self->{ "_term" } = $value;
889 return $self->{ "_term" };
892 # convert simple strings to Bio::Annotation::DBLinks
894 my ($self , $links) = @_;
895 return unless $links;
897 for my $string (@
{$links}) {
898 my ($db, $id) = split(':',$string);
899 push @dbxrefs, Bio
::Annotation
::DBLink
->new(-database
=> $db, -primary_id
=> $id);