2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms), " root terms, and ",
29 scalar($ont->get_all_terms), " total terms, and ",
30 scalar($ont->get_leaf_terms), " leaf terms\n";
35 Parser for OBO flat-file format. 'obo' example:
38 ontology: so/dev/externalDerived
39 property_value: owl:versionInfo "$Revision: 80 $" xsd:string
45 def: "A region of sequence, aligned to another sequence." []
50 def: "A part of a match." []
53 Specification: L<http://www.geneontology.org/GO.format.obo-1_2.shtml>.
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to the
61 Bioperl mailing lists Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
83 https://github.com/bioperl/bioperl-live/issues
89 Email: s-merchant@northwestern.edu
93 Northwestern University
94 Center for Genetic Medicine (CGM), dictyBase
101 Hilmar Lapp, hlapp at gmx.net
102 Chris Mungall, cjm at fruitfly.org
103 Brian Osborne, briano@bioteam.net
107 The rest of the documentation details each of the object
108 methods. Internal methods are usually preceded with a _
112 package Bio
::OntologyIO
::obo
;
117 use Bio
::Ontology
::OBOEngine
;
118 use Bio
::Ontology
::Ontology
;
119 use Bio
::Ontology
::OntologyStore
;
120 use Bio
::Ontology
::TermFactory
;
121 use Bio
::Annotation
::Collection
;
122 use Text
::Balanced
qw(extract_quotelike extract_bracketed);
124 use constant TRUE
=> 1;
125 use constant FALSE
=> 0;
127 use base
qw(Bio::OntologyIO);
132 Usage : $parser = Bio::OntologyIO->new(
134 -file => "gene_ontology.obo");
135 Function: Creates a new dagflat parser.
136 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
137 Args : -file => a single ontology flat file holding the
138 terms, descriptions and relationships
139 -ontology_name => the name of the ontology; if not specified the
140 parser will assign the name of the ontology as the
141 default-namespace header value from the OBO file.
142 -engine => the Bio::Ontology::OntologyEngineI object
143 to be reused (will be created otherwise); note
144 that every Bio::Ontology::OntologyI will
145 qualify as well since that one inherits from the
148 See L<Bio::OntologyIO>.
152 # Let OntologyIO::new() do the instantiation, and override
153 # _initialize for all initialization work
155 my ( $self, %arg ) = @_;
157 my ( $file, $name, $eng ) = $self->_rearrange(
166 $self->SUPER::_initialize
(%arg);
167 delete $self->{'_ontologies'};
169 # ontology engine (and possibly name if it's an OntologyI)
170 $eng = Bio
::Ontology
::OBOEngine
->new() unless $eng;
171 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
172 $self->ontology_name( $eng->name() );
173 $eng = $eng->engine() if $eng->can('engine');
175 $self->_ont_engine($eng);
177 $self->ontology_name($name) if $name;
182 Title : ontology_name
183 Usage : $obj->ontology_name($newval)
184 Function: Get/set the name of the ontology parsed by this module.
186 Returns : value of ontology_name (a scalar)
187 Args : on set, new value (a scalar or undef, optional)
194 return $self->{'ontology_name'} = shift if @_;
195 return $self->{'ontology_name'};
201 Usage : $parser->parse();
202 Function: Parses the files set with "new" or with methods
203 defs_file and _flat_files.
205 Normally you should not need to call this method as it will
206 be called automatically upon the first call to
209 Returns : Bio::Ontology::OntologyEngineI
217 # Setup the default term factory if not done by anyone yet
218 $self->term_factory(Bio
::Ontology
::TermFactory
->new( -type
=> "Bio::Ontology::OBOterm" ) )
219 unless $self->term_factory();
221 # Parse the file header
222 my $annotations_collection = $self->_header();
224 # Create the default ontology object itself
225 my $ont = Bio
::Ontology
::Ontology
->new(
226 -name
=> $self->ontology_name(),
227 -engine
=> $self->_ont_engine()
230 # Assign the file headers
231 $ont->annotation($annotations_collection);
233 # Set up the ontology of the relationship types
235 $self->_part_of_relationship(),
236 $self->_is_a_relationship(),
237 $self->_related_to_relationship(),
238 $self->_regulates_relationship(),
239 $self->_positively_regulates_relationship(),
240 $self->_negatively_regulates_relationship(),
246 $self->_add_ontology($ont);
249 while ( my $term = $self->_next_term() ) {
251 # Check if the terms has a valid ID and NAME otherwise ignore the term
252 if ( !$term->identifier() || !$term->name() ) {
253 $self->throw( "OBO File Format Error on line "
254 . $self->{'_current_line_no'}
255 . "\nThe term does not have a id/name tag. This term will be ignored."
260 my $new_ontology_flag = 1;
261 my $ontologies_array_ref = $self->{'_ontologies'};
263 for my $ontology ( @
$ontologies_array_ref ) {
264 my ($oname, $t_ns) = ( $ontology->name, $term->namespace );
265 next unless ( defined($oname) && defined($t_ns) );
266 if ( $oname eq $t_ns ) {
267 # No need to create new ontology
268 $new_ontology_flag = 0;
273 if ( $new_ontology_flag && $term->namespace ) {
274 my $new_ont = Bio
::Ontology
::Ontology
->new(
275 -name
=> $term->namespace,
276 -engine
=> $self->_ont_engine
278 $new_ont->annotation($annotations_collection);
279 $self->_add_ontology($new_ont);
283 $self->_add_term( $term, $ont );
285 # Adding the IS_A relationship
286 for my $parent_term ( @
{$self->{'_isa_parents'}} ) {
287 # Check if parent exists, if not then add the term to the graph.
288 if ( ! $self->_has_term($parent_term) ) {
289 $self->_add_term( $parent_term, $ont ); # !
292 $self->_add_relationship( $parent_term, $term,
293 $self->_is_a_relationship(), $ont );
296 # Adding the other relationships like part_of, related_to, develops_from
297 for my $relationship ( keys %{$self->{'_relationships'}} ) {
299 # Check if relationship exists, if not add it
300 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
301 $reltype = $self->_ont_engine->get_relationship_type($relationship);
304 $self->_ont_engine->add_relationship_type( $relationship, $ont );
305 $reltype = $self->_ont_engine->get_relationship_type($relationship);
308 # Check if the id already exists in the graph
309 for my $id ( @
{$self->{'_relationships'}->{$relationship}} ) {
310 my $parent_term = $self->_create_term_object();
311 $parent_term->identifier($id);
312 $parent_term->ontology($ont);
314 if ( ! $self->_has_term($parent_term) ) {
315 $self->_add_term( $parent_term, $ont );
318 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
323 return $self->_ont_engine();
328 Title : next_ontology
330 Function: Get the next available ontology from the parser. This is the
331 method prescribed by Bio::OntologyIO.
333 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
334 there is no more ontology in the input.
342 # Parse if not done already
343 $self->parse() unless exists( $self->{'_ontologies'} );
345 # Return next available ontology
346 if ( exists( $self->{'_ontologies'} ) ) {
347 my $ont = shift( @
{ $self->{'_ontologies'} } );
349 my $store = Bio
::Ontology
::OntologyStore
->new();
350 $store->register_ontology($ont);
362 Function: Closes this ontology stream and associated file handles.
364 Clients should call this method especially when they write
367 We need to override this here in order to close the file
368 handle for the term definitions file.
379 # first call the inherited implementation
380 $self->SUPER::close();
387 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
388 foreach my $ont (@_) {
390 ref($ont) . " does not implement Bio::Ontology::OntologyI" )
391 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
393 # the ontology name may have been auto-discovered while parsing
395 $ont->name( $self->ontology_name ) unless $ont->name();
396 push( @
{ $self->{'_ontologies'} }, $ont );
400 # This simply delegates. See Ontology::OBOEngine::add_term.
402 my ( $self, $term, $ont ) = @_;
403 $term->ontology($ont) if $ont && ( !$term->ontology );
404 $self->_ont_engine()->add_term($term);
407 # This simply delegates. See OBOEngine
408 sub _part_of_relationship
{
411 return $self->_ont_engine()->part_of_relationship(@_);
414 # This simply delegates. See OBOEngine
415 sub _is_a_relationship
{
418 return $self->_ont_engine()->is_a_relationship(@_);
421 # This simply delegates. See OBOEngine
422 sub _related_to_relationship
{
425 return $self->_ont_engine()->related_to_relationship(@_);
428 # This simply delegates. See OBOEngine
429 sub _regulates_relationship
{
432 return $self->_ont_engine()->regulates_relationship(@_);
435 # This simply delegates. See OBOEngine
436 sub _positively_regulates_relationship
{
439 return $self->_ont_engine()->positively_regulates_relationship(@_);
442 # This simply delegates. See OBOEngine
443 sub _negatively_regulates_relationship
{
446 return $self->_ont_engine()->negatively_regulates_relationship(@_);
449 # This simply delegates. See OBOEngine
450 sub _add_relationship
{
451 my ( $self, $parent, $child, $type, $ont ) = @_;
453 # note the triple terminology (subject,predicate,object) corresponds to
454 # (child,type,parent)
455 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
458 # This simply delegates. See OBOEngine
462 return $self->_ont_engine()->has_term(@_);
465 # Holds the OBO engine to be parsed into
467 my ( $self, $value ) = @_;
469 if ( defined $value ) {
470 $self->{"_ont_engine"} = $value;
473 $self->{"_ont_engine"};
476 # Removes the escape chracters from the file
478 my ( $self, $line ) = @_;
481 $line =~ tr
[\200-\377] [\000-\177];
482 # see 'man perlop', section on tr/
483 # weird ascii characters should be excluded
484 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
485 # preserve \11 (9 - tab) and \12 (10-linefeed)
486 $line =~ tr/\13\14//d; # remove weird characters; 11,12
487 # preserve \15 (13 - carriage return)
488 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
489 $line =~ tr/\177//d; # remove DEL character
492 $line =~ s/[^\\]\!.*//;
493 $line =~ s/[^\\]\#.*//;
503 my $annotation_collection = Bio
::Annotation
::Collection
->new();
505 my $line_counter = 0;
506 $self->{'_current_line_no'} = 0;
507 my $format_version_header_flag = 0;
508 my $default_namespace_header_flag = 0;
510 while ( my $line = $self->_readline() ) {
512 my $line = $self->_filter_line($line);
515 if ( !$format_version_header_flag ) {
516 $self->throw("Format Error - Cannot find tag format-version." .
517 "This is required in header" );
520 $self->{'_current_line_no'} = $line_counter;
521 return $annotation_collection;
524 # Check if there is a header
525 if ( $line =~ /\[\w*\]/ ) {
526 $self->throw("Format Error - Cannot find tag format-version." .
527 "This is required in header." );
530 # If the line is not null, check it contains at least one colon
531 $self->_check_colon( $line, $line_counter );
533 # These are the allowed headers. Any other headers will be ignored
534 if ( $line =~ /^(\
[|format
-version
:
547 |default-relationship
-id
-prefix
:
552 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
553 ( $tag, $value ) = ( $1, $2 );
556 if ( $tag =~ /format-version/) {
557 $format_version_header_flag = 1;
558 }elsif( $tag =~ /default-namespace/ ) {
559 $default_namespace_header_flag = 1;
562 my $header = Bio
::Annotation
::SimpleValue
->new( -value
=> $value );
563 $annotation_collection->add_Annotation( $tag, $header );
565 # Assign the Ontology name as the value of the default-namespace header
566 if ( $tag =~ /default-namespace/i ) {
567 $self->ontology_name($value);
573 # Parses each stanza of the file
577 my $skip_stanza_flag = 1;
578 my $line_counter = $self->{'_current_line_no'};
580 while ( my $line = $self->_readline() ) {
582 my $line = $self->_filter_line($line);
584 if ( !$line && $term ) {
585 $self->{'_current_line_no'} = $line_counter;
589 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { # New stanza
590 if ( uc($1) eq "TERM" ) {
591 $term = $self->_create_term_object;
592 $skip_stanza_flag = 0;
594 # Reset the relationships after each stanza
595 $self->{'_relationships'} = {};
596 $self->{'_isa_parents'} = undef;
598 elsif ( uc($1) eq "TYPEDEF" ) {
599 $skip_stanza_flag = 1;
600 # Check if this typedef is already defined by the relationship
603 $skip_stanza_flag = 1;
604 $self->warn("OBO File Format Warning on line $line_counter $line\n"
605 . "Unrecognized stanza type found. Skipping this stanza." );
610 # If the line is not null, check it contains at least one colon
611 $self->_check_colon( $line, $line_counter );
613 # If there is any tag value other than the list below move to the next tag
614 next if (( $line !~ /^(\
[|id
:
635 ) || $skip_stanza_flag );
638 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
639 my ( $tag, $val ) = ( $1, $2 );
641 # If no value for the tag throw a warning
643 $self->warn("OBO File Format Warning on line $line_counter $line\n" .
649 ( $val, $qh ) = $self->_extract_quals($val);
653 if ( $tag eq "ID" ) {
654 $term->identifier($val);
655 if ( $self->_has_term($term) ) {
656 $term = $self->_ont_engine()->get_terms($val);
659 elsif ( $tag eq "NAME" ) {
662 elsif ( $tag eq "XREF_ANALOG" ) {
663 if ( !$term->has_dbxref($val) ) {
664 $term->add_dbxref(-dbxrefs
=> $self->_to_annotation( [$val] ) );
667 elsif ( $tag eq "XREF_UNKNOWN" ) {
668 $term->add_dbxref(-dbxrefs
=> $self->_to_annotation( [$val] ) );
670 elsif ( $tag eq "NAMESPACE" ) {
671 $term->namespace($val);
673 elsif ( $tag eq "DEF" ) {
674 my ( $defstr, $parts ) = $self->_extract_qstr($val);
675 $term->definition($defstr);
676 my $ann = $self->_to_annotation($parts);
677 $term->add_dbxref( -dbxrefs
=> $ann );
679 elsif ( $tag eq "SYNONYM" ) {
680 $term->add_synonym($val);
682 elsif ( $tag eq "ALT_ID" ) {
683 $term->add_secondary_id($val);
685 elsif ( $tag =~ /XREF/i ) {
686 $term->add_secondary_id($val);
688 elsif ( $tag eq "IS_OBSOLETE" ) {
689 if ( $val eq 'true' ) {
692 elsif ( $val eq 'false' ) {
695 $term->is_obsolete($val);
697 elsif ( $tag eq "COMMENT" ) {
698 $term->comment($val);
700 elsif ( $tag eq "RELATIONSHIP" ) {
701 $self->_handle_relationship_tag($val);
703 elsif ( $tag eq "IS_A" ) {
705 my $parent_term = $self->_create_term_object();
706 $parent_term->identifier($val);
707 push @
{ $self->{'_isa_parents'} }, $parent_term;
716 # Creates a Bio::Ontology::OBOterm object
717 sub _create_term_object
{
719 my $term = $self->term_factory->create_object();
724 my ( $self, $str ) = @_;
727 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
731 my @qparts = $self->_split_on_comma($extr);
733 if (/(\w+)=\"(.*)\"/) {
736 elsif (/(\w+)=\'(.*)\'/) {
744 return ( $return_str, \
%q );
752 my ( $self, $str ) = @_;
754 my ( $extr, $rem, $prefix ) = extract_quotelike
($str);
759 warn("illegal prefix: $prefix in: $str");
764 # e.g. synonym: "foo" EXACT [...]
765 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
767 push( @extra, split( ' ', $1 ) );
771 while ( ( $extr, $rem, $prefix ) = extract_bracketed
( $rem, '[]' ) ) {
775 push( @parts, $extr ) if $extr;
778 map { $self->_split_on_comma($_) } @parts;
782 ( $txt, \
@parts, \
@extra );
785 sub _split_on_comma
{
786 my ( $self, $str ) = @_;
788 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
791 unshift( @parts, $part );
794 unshift( @parts, $str );
796 return map { s/\\//g; $_ } @parts;
799 # This method checks for an existing colon in a line
801 my ( $self, $line, $line_no ) = @_;
802 if ( $line && !( $line =~ /:/ ) ) {
803 $self->throw("OBO File Format Error on line $line_no $line\n" .
804 "Cannot find key-terminating colon"
809 # This method handles relationship tags
810 sub _handle_relationship_tag
{
811 my ( $self, $val ) = @_;
812 my @parts = split( / /, $val );
813 my $relationship = uc($parts[0]);
814 my $id = $parts[1] =~ /\^(w+)\s+\!/ ?
$1 : $parts[1];
815 my $parent_term = $self->_create_term_object();
816 $parent_term->identifier($id);
818 if ( my $realtionships_hash = $self->{'_relationships'} ) {
819 my $id_array_ref = $$realtionships_hash{$relationship};
820 if ( !$id_array_ref ) {
823 $$realtionships_hash{$relationship} = \
@ids;
827 push( @
$id_array_ref, $id );
832 # Convert simple strings to Bio::Annotation::DBLinks
834 my ($self , $links) = @_;
835 return unless $links;
837 for my $string (@
{$links}) {
838 my ($db, $id) = split(':',$string);
839 push @dbxrefs, Bio
::Annotation
::DBLink
->new(-database
=> $db, -primary_id
=> $id);