2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
15 Bio::OntologyIO::obo - a parser for OBO flat-file format from Gene Ontology Consortium
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms)," root terms, and ",
29 scalar($ont->get_all_terms)," total terms, and ",
30 scalar($ont->get_leaf_terms)," leaf terms\n";
36 Needs Graph.pm from CPAN.
42 User feedback is an integral part of the evolution of this and other
43 Bioperl modules. Send your comments and suggestions preferably to the
44 Bioperl mailing lists. Your participation is much appreciated.
46 bioperl-l@bioperl.org - General discussion
47 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 Please direct usage questions or support issues to the mailing list:
53 I<bioperl-l@bioperl.org>
55 rather than to the module maintainer directly. Many experienced and
56 responsive experts will be able to look at the problem and quickly
57 address it. Please include a thorough description of the problem
58 with code and data examples if at all possible.
62 Report bugs to the Bioperl bug tracking system to help us keep track
63 of the bugs and their resolution. Bug reports can be submitted via the
66 http://bugzilla.open-bio.org/
72 Email: s-merchant@northwestern.edu
77 Northwestern University
78 Center for Genetic Medicine (CGM), dictyBase
85 Hilmar Lapp, hlapp at gmx.net
86 Chris Mungall, cjm at fruitfly.org
90 The rest of the documentation details each of the object
91 methods. Internal methods are usually preceded with a _
95 package Bio
::OntologyIO
::obo
;
100 use Bio
::Ontology
::OBOEngine
;
101 use Bio
::Ontology
::Ontology
;
102 use Bio
::Ontology
::OntologyStore
;
103 use Bio
::Ontology
::TermFactory
;
104 use Bio
::Annotation
::Collection
;
105 use Text
::Balanced
qw(extract_quotelike extract_bracketed);
107 use constant TRUE
=> 1;
108 use constant FALSE
=> 0;
110 use base
qw(Bio::OntologyIO);
115 Usage : $parser = Bio::OntologyIO->new(
117 -file => "gene_ontology.obo");
118 Function: Creates a new OBO parser.
119 Returns : A new OBO parser object, implementing Bio::OntologyIO.
120 Args : -file => a single ontology flat file holding the
121 terms, descriptions and relationships
122 -ontology_name => the name of the ontology; if not specified the
123 parser will assign the name of the ontology as the
124 default-namespace header value from the OBO file.
125 -engine => the Bio::Ontology::OntologyEngineI object
126 to be reused (will be created otherwise); note
127 that every Bio::Ontology::OntologyI will
128 qualify as well since that one inherits from the
131 See L<Bio::OntologyIO>.
135 # in reality, we let OntologyIO::new do the instantiation, and override
136 # _initialize for all initialization work
138 my ( $self, %arg ) = @_;
140 my ( $file, $name, $eng ) = $self->_rearrange(
149 $self->SUPER::_initialize
(%arg);
150 delete $self->{'_ontologies'};
152 # ontology engine (and possibly name if it's an OntologyI)
153 $eng = Bio
::Ontology
::OBOEngine
->new() unless $eng;
154 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
155 $self->ontology_name( $eng->name() );
156 $eng = $eng->engine() if $eng->can('engine');
158 $self->_ont_engine($eng);
160 $self->ontology_name($name) if $name;
166 Title : ontology_name
167 Usage : $obj->ontology_name($newval)
168 Function: Get/set the name of the ontology parsed by this module.
170 Returns : value of ontology_name (a scalar)
171 Args : on set, new value (a scalar or undef, optional)
179 return $self->{'ontology_name'} = shift if @_;
180 return $self->{'ontology_name'};
186 Usage : $parser->parse();
187 Function: Parses the files set with "new" or with methods
188 defs_file and _flat_files.
190 Normally you should not need to call this method as it will
191 be called automatically upon the first call to
194 Returns : Bio::Ontology::OntologyEngineI
202 # setup the default term factory if not done by anyone yet
204 Bio
::Ontology
::TermFactory
->new( -type
=> "Bio::Ontology::OBOterm" ) )
205 unless $self->term_factory();
207 ## Parse the file header
208 my $annotations_collection = $self->_header();
210 # create the default ontology object itself
211 my $ont = Bio
::Ontology
::Ontology
->new(
212 -name
=> $self->ontology_name(),
213 -engine
=> $self->_ont_engine()
216 ## Assign the file headers
217 $ont->annotation($annotations_collection);
219 # set up the ontology of the relationship types
221 $self->_part_of_relationship(),
222 $self->_is_a_relationship(),
223 $self->_related_to_relationship(),
224 $self->_regulates_relationship(),
225 $self->_positively_regulates_relationship(),
226 $self->_negatively_regulates_relationship(),
232 ##################################
233 $self->_add_ontology($ont);
234 ##################################
237 while ( my $term = $self->_next_term() ) {
239 ### Check if the term has a valid ID and NAME, otherwise ignore the term
240 if ( !$term->identifier() || !$term->name() ) {
241 $self->throw( "OBO File Format Error on line "
242 . $self->{'_current_line_no'}
243 . " \nThe term does not have a id/name tag. This term will be ignored.\n"
248 #print $term->identifier(),"\t",$term->name(),"\n";
250 my $new_ontology_flag = 1;
251 my $ontologies_array_ref = $self->{'_ontologies'};
252 foreach my $ontology (@
$ontologies_array_ref) {
253 my ($oname, $t_ns) = ($ontology->name(), $term->namespace() );
254 next unless (defined($oname) && defined($t_ns));
255 if ( $oname eq $t_ns ) {
256 ### No need to create new ontology
257 $new_ontology_flag = 0;
262 if ( $new_ontology_flag && $term->namespace() ) {
263 my $new_ont = Bio
::Ontology
::Ontology
->new(
264 -name
=> $term->namespace(),
265 -engine
=> $self->_ont_engine()
267 $new_ont->annotation($annotations_collection);
268 $self->_add_ontology($new_ont);
273 $self->_add_term( $term, $ont );
275 #### Adding the IS_A relationship
276 my $isa_parents_array_ref = $self->{'_isa_parents'};
277 foreach my $parent_term (@
$isa_parents_array_ref) {
278 ### Check if parent exist, if not then add the term to the graph.
279 if ( !( $self->_has_term($parent_term) ) ) {
280 $self->_add_term( $parent_term, $ont );
283 $self->_add_relationship( $parent_term, $term,
284 $self->_is_a_relationship(), $ont );
287 #### Adding the other relationships like part_of, related_to, develops_from
288 my $relationship_hash_ref = $self->{'_relationships'};
289 foreach my $relationship ( keys %$relationship_hash_ref ) {
291 #### Check if relationship exist, if not add it.
292 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
294 $self->_ont_engine->get_relationship_type($relationship);
297 $self->_ont_engine->add_relationship_type( $relationship,
300 $self->_ont_engine->get_relationship_type($relationship);
303 #### Check if the id already exist in the graph
304 my $id_array_ref = $$relationship_hash_ref{$relationship};
305 foreach my $id (@
$id_array_ref) {
306 my $parent_term = $self->_create_term_object();
307 $parent_term->identifier($id);
308 $parent_term->ontology($ont);
310 if ( !( $self->_has_term($parent_term) ) ) {
311 $self->_add_term( $parent_term, $ont );
314 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
320 return $self->_ont_engine();
325 Title : next_ontology
327 Function: Get the next available ontology from the parser. This is the
328 method prescribed by Bio::OntologyIO.
330 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
331 there is no more ontology in the input.
340 # parse if not done already
341 $self->parse() unless exists( $self->{'_ontologies'} );
343 # return next available ontology
344 if ( exists( $self->{'_ontologies'} ) ) {
345 my $ont = shift( @
{ $self->{'_ontologies'} } );
347 my $store = Bio
::Ontology
::OntologyStore
->new();
348 $store->register_ontology($ont);
360 Function: Closes this ontology stream and associated file handles.
362 Clients should call this method especially when they write
365 We need to override this here in order to close the file
366 handle for the term definitions file.
378 # first call the inherited implementation
379 $self->SUPER::close();
387 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
388 foreach my $ont (@_) {
390 ref($ont) . " does not implement Bio::Ontology::OntologyI" )
391 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
393 # the ontology name may have been auto-discovered while parsing
395 $ont->name( $self->ontology_name ) unless $ont->name();
396 push( @
{ $self->{'_ontologies'} }, $ont );
400 # This simply delegates. See OBOEngine.
402 my ( $self, $term, $ont ) = @_;
403 $term->ontology($ont) if $ont && ( !$term->ontology );
404 $self->_ont_engine()->add_term($term);
# Thin delegate to the ontology engine (see OBOEngine).
#
# Forwards any arguments given after the invocant, unchanged, to
# part_of_relationship() on the object returned by $self->_ont_engine(),
# and returns that call's result.
sub _part_of_relationship {
    my $self = shift;    # strip the invocant so only caller args are forwarded

    return $self->_ont_engine()->part_of_relationship(@_);
}    # _part_of_relationship
# Thin delegate to the ontology engine (see OBOEngine).
#
# Forwards any arguments given after the invocant, unchanged, to
# is_a_relationship() on the object returned by $self->_ont_engine(),
# and returns that call's result.
sub _is_a_relationship {
    my $self = shift;    # strip the invocant so only caller args are forwarded

    return $self->_ont_engine()->is_a_relationship(@_);
}    # _is_a_relationship
# Thin delegate to the ontology engine (see OBOEngine).
#
# Forwards any arguments given after the invocant, unchanged, to
# related_to_relationship() on the object returned by $self->_ont_engine(),
# and returns that call's result.
sub _related_to_relationship {
    my $self = shift;    # strip the invocant so only caller args are forwarded

    return $self->_ont_engine()->related_to_relationship(@_);
}    # _related_to_relationship
# Thin delegate to the ontology engine (see OBOEngine).
#
# Forwards any arguments given after the invocant, unchanged, to
# regulates_relationship() on the object returned by $self->_ont_engine(),
# and returns that call's result.
sub _regulates_relationship {
    my $self = shift;    # strip the invocant so only caller args are forwarded

    return $self->_ont_engine()->regulates_relationship(@_);
}    # _regulates_relationship
# Thin delegate to the ontology engine (see OBOEngine).
#
# Forwards any arguments given after the invocant, unchanged, to
# positively_regulates_relationship() on the object returned by
# $self->_ont_engine(), and returns that call's result.
sub _positively_regulates_relationship {
    my $self = shift;    # strip the invocant so only caller args are forwarded

    return $self->_ont_engine()->positively_regulates_relationship(@_);
}    # _positively_regulates_relationship
# Thin delegate to the ontology engine (see OBOEngine).
#
# Forwards any arguments given after the invocant, unchanged, to
# negatively_regulates_relationship() on the object returned by
# $self->_ont_engine(), and returns that call's result.
sub _negatively_regulates_relationship {
    my $self = shift;    # strip the invocant so only caller args are forwarded

    return $self->_ont_engine()->negatively_regulates_relationship(@_);
}    # _negatively_regulates_relationship
# Registers a relationship with the ontology engine (see OBOEngine).
#
# Callers supply (parent, child, type, ontology). The engine's
# add_relationship() is invoked with (child, type, parent, ontology):
# a (subject, predicate, object) triple followed by the ontology.
# The result of the engine call is handed back to the caller.
sub _add_relationship {
    my $self = shift;
    my ( $object_term, $subject_term, $predicate, $ontology ) = @_;

    my $engine = $self->_ont_engine();

    # argument order is deliberately swapped: child first, parent third
    return $engine->add_relationship( $subject_term, $predicate,
        $object_term, $ontology );
}    # _add_relationship
461 # This simply delegates. See OBOEngine
465 return $self->_ont_engine()->has_term(@_);
468 # Holds the OBO engine to be parsed into
470 my ( $self, $value ) = @_;
472 if ( defined $value ) {
473 $self->{"_ont_engine"} = $value;
476 return $self->{"_ont_engine"};
479 # Removes the escape characters from the file
481 my ( $self, $line ) = @_;
484 $line =~ tr
[\200-\377]
485 [\000-\177]; # see 'man perlop', section on tr/
486 # weird ascii characters should be excluded
487 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
488 # preserve \11 (9 - tab) and \12 (10-linefeed)
489 $line =~ tr/\13\14//d; # remove weird characters; 11,12
490 # preserve \15 (13 - carriage return)
491 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
492 $line =~ tr/\177//d; # remove DEL character
495 $line =~ s/[^\\]\!.*//;
496 $line =~ s/[^\\]\#.*//;
506 my $annotation_collection = Bio
::Annotation
::Collection
->new();
508 my $line_counter = 0;
509 $self->{'_current_line_no'} = 0;
510 my $format_version_header_flag = 0;
511 my $default_namespace_header_flag = 0;
513 while ( my $line = $self->_readline() ) {
515 my $line = $self->_filter_line($line);
517 if ( !$format_version_header_flag || !$default_namespace_header_flag) {
519 "OBO File Format Error - \nCannot find tag format-version and/ default-namespace . These are required header.\n"
523 $self->{'_current_line_no'} = $line_counter;
524 return $annotation_collection;
527 ### Check if there is a header
528 if($line =~ /\[\w*\]/) {
530 "OBO File Format Error - \nCannot find tag format-version. Thi ia a required header.\n"
535 ### If the line is not null, check it contains at least one colon
536 $self->_check_colon( $line, $line_counter );
538 ### These are the allowed headers. Any other headers will be ignored
540 /^(\[|format-version:|typeref:|version:|date:|saved-by:|auto-generated-by:|default-namespace:|remark:|subsetdef:)/
543 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
544 ( $tag, $value ) = ( $1, $2 );
547 if ( $tag =~ /format-version/) {
548 $format_version_header_flag = 1;
549 }elsif( $tag =~ /default-namespace/ ) {
550 $default_namespace_header_flag = 1;
553 my $header = Bio
::Annotation
::SimpleValue
->new( -value
=> $value );
554 $annotation_collection->add_Annotation( $tag, $header );
556 #### Assign the Ontology name as the value of the default-namespace header
557 if ( $tag =~ /default-namespace/i ) {
559 $self->ontology_name($value);
568 ### Parses each stanza of the file
572 my $skip_stanza_flag = 1;
573 my $line_counter = $self->{'_current_line_no'};
575 while ( my $line = $self->_readline() ) {
578 my $line = $self->_filter_line($line);
579 if ( !$line && $term ) {
580 $self->{'_current_line_no'} = $line_counter;
584 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { #New stanza
586 if ( uc($1) eq "TERM" ) {
588 $term = $self->_create_term_object;
589 $skip_stanza_flag = 0;
590 ### Reset the relationships after each stanza
591 $self->{'_relationships'} = {};
592 $self->{'_isa_parents'} = undef;
594 elsif ( uc($1) eq "TYPEDEF" ) {
595 $skip_stanza_flag = 1;
596 ### Check if this typedef is already defined by the relationship
599 $skip_stanza_flag = 1;
601 "OBO File Format Warning on line $line_counter $line \nUnrecognized stanza type found. Skipping this stanza.\n"
607 ### If the line is not null, check it contains at least one colon
608 $self->_check_colon( $line, $line_counter );
610 ### If there is any tag other than those in the list below, move to the next tag
615 /^(\[|id:|name:|is_a:|relationship:|namespace:|is_obsolete:|alt_id:|def:|xref_analog:|exact_synonym:|broad_synonym:|related_synonym:|synonym:|comment:|xref:)/
620 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) { #TAg Value pair
621 my ( $tag, $val ) = ( $1, $2 );
623 ### If no value for the tag thrown a warning
626 "OBO File Format Warning on line $line_counter $line \nTag has no value\n"
631 ( $val, $qh ) = $self->_extract_quals($val);
635 if ( $tag eq "ID" ) {
637 $term->identifier($val);
638 if ( $self->_has_term($term) ) {
639 $term = $self->_ont_engine()->get_terms($val);
643 elsif ( $tag eq "NAME" ) {
646 elsif ( $tag eq "XREF_ANALOG" ) {
647 if ( !$term->has_dbxref($val) ) {
648 $term->add_dbxref(-dbxrefs
=> $self->_to_annotation([$val]));
651 elsif ( $tag eq "XREF_UNKNOWN" ) {
652 $term->add_dbxref(-dbxrefs
=> $self->_to_annotation([$val]));
654 elsif ( $tag eq "NAMESPACE" ) {
655 $term->namespace($val);
657 elsif ( $tag eq "DEF" ) {
658 my ( $defstr, $parts ) = $self->_extract_qstr($val);
659 $term->definition($defstr);
660 my $ann = $self->_to_annotation($parts);
661 $term->add_dbxref(-dbxrefs
=> $ann);
663 elsif ( $tag =~ /(\w*)synonym/i ) {
664 #$val =~ s/['"\[\]]//g; #NML commented out b/c need quotes
665 $term->add_synonym($val);
667 elsif ( $tag eq "ALT_ID" ) {
668 $term->add_secondary_id($val);
670 elsif ( $tag =~ /XREF/i ) {
671 $term->add_secondary_id($val);
673 elsif ( $tag eq "IS_OBSOLETE" ) {
675 if ( $val eq 'true' ) {
678 if ( $val eq 'false' ) {
681 $term->is_obsolete($val);
683 elsif ( $tag eq "COMMENT" ) {
684 $term->comment($val);
686 elsif ( $tag eq "RELATIONSHIP" ) {
687 $self->_handle_relationship_tag($val);
689 elsif ( $tag eq "IS_A" ) {
692 my $parent_term = $self->_create_term_object();
693 $parent_term->identifier($val);
695 if ( $self->{'_isa_parents'} ) {
696 my $isa_parents_array_ref = $self->{'_isa_parents'};
697 push( @
$isa_parents_array_ref, $parent_term );
701 push( @terms_array, $parent_term );
702 $self->{'_isa_parents'} = \
@terms_array;
710 # Creates a Bio::Ontology::OBOterm object
711 sub _create_term_object
{
714 my $term = $self->term_factory->create_object();
721 my ( $self, $str ) = @_;
724 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
728 my @qparts = $self->_split_on_comma($extr);
730 if (/(\w+)=\"(.*)\"/) {
733 elsif (/(\w+)=\'(.*)\'/) {
741 return ( $return_str, \
%q );
749 my ( $self, $str ) = @_;
751 my ( $extr, $rem, $prefix ) = extract_quotelike
($str);
756 warn("illegal prefix: $prefix in: $str");
761 # eg synonym: "foo" EXACT [...]
762 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
764 push( @extra, split( ' ', $1 ) );
768 while ( ( $extr, $rem, $prefix ) = extract_bracketed
( $rem, '[]' ) ) {
772 push( @parts, $extr ) if $extr;
775 map { $self->_split_on_comma($_) } @parts;
778 return ( $txt, \
@parts, \
@extra );
781 sub _split_on_comma
{
782 my ( $self, $str ) = @_;
784 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
787 unshift( @parts, $part );
790 unshift( @parts, $str );
791 return map { s/\\//g; $_ } @parts;
794 # This method checks for an existing colon in a line
796 my ( $self, $line, $line_no ) = @_;
797 if ( $line && !( $line =~ /:/ ) ) {
799 "OBO File Format Error on line $line_no $line - \nCannot find key-terminating colon\n"
804 # This method handles relationship tags
805 sub _handle_relationship_tag
{
806 my ( $self, $val ) = @_;
807 my @parts = split( / /, $val );
808 my $relationship = uc($parts[0]);
809 my $id = $parts[1] =~ /\^(w+)\s+\!/ ?
$1 : $parts[1];
810 my $parent_term = $self->_create_term_object();
811 $parent_term->identifier($id);
813 if ( my $realtionships_hash = $self->{'_relationships'} ) {
814 my $id_array_ref = $$realtionships_hash{$relationship};
815 if ( !$id_array_ref ) {
818 $$realtionships_hash{$relationship} = \
@ids;
822 push( @
$id_array_ref, $id );
829 # convert simple strings to Bio::Annotation::DBLinks
831 my ($self , $links) = @_;
832 return unless $links;
834 for my $string (@
{$links}) {
835 my ($db, $id) = split(':',$string);
836 push @dbxrefs, Bio
::Annotation
::DBLink
->new(-database
=> $db, -primary_id
=> $id);