Couple more minor edits ...
[bioperl-live.git] / Bio / OntologyIO / obo.pm
blob9b3741a7e1a9a8c32807105d1cdb7d35f57e0241
2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
13 =head1 NAME
15 Bio::OntologyIO::obo
17 =head1 SYNOPSIS
19 use Bio::OntologyIO;
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
23 ( -format => "obo",
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms), " root terms, and ",
29 scalar($ont->get_all_terms), " total terms, and ",
30 scalar($ont->get_leaf_terms), " leaf terms\n";
33 =head1 DESCRIPTION
35 Parser for OBO flat-file format. 'obo' example:
37 format-version: 1.2
38 ontology: so/dev/externalDerived
39 property_value: owl:versionInfo "$Revision: 80 $" xsd:string
40 default-namespace: SO
42 [Term]
43 id: SO_0000343
44 name: match
45 def: "A region of sequence, aligned to another sequence." []
47 [Term]
48 id: SO_0000039
49 name: match_part
50 def: "A part of a match." []
51 is_a: SO_0000343
53 Specification: L<http://www.geneontology.org/GO.format.obo-1_2.shtml>.
55 =head1 FEEDBACK
57 =head2 Mailing Lists
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to the
61 Bioperl mailing lists Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 =head2 Support
68 Please direct usage questions or support issues to the mailing list:
70 I<bioperl-l@bioperl.org>
72 rather than to the module maintainer directly. Many experienced and
73 reponsive experts will be able look at the problem and quickly
74 address it. Please include a thorough description of the problem
75 with code and data examples if at all possible.
77 =head2 Reporting Bugs
79 Report bugs to the Bioperl bug tracking system to help us keep track
80 the bugs and their resolution. Bug reports can be submitted via the
81 web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 AUTHOR
87 Sohel Merchant
89 Email: s-merchant@northwestern.edu
91 Address:
93 Northwestern University
94 Center for Genetic Medicine (CGM), dictyBase
95 Suite 1206,
96 676 St. Clair st
97 Chicago IL 60611
99 =head2 CONTRIBUTOR
101 Hilmar Lapp, hlapp at gmx.net
102 Chris Mungall, cjm at fruitfly.org
103 Brian Osborne, briano@bioteam.net
105 =head1 APPENDIX
107 The rest of the documentation details each of the object
108 methods. Internal methods are usually preceded with a _
110 =cut
112 package Bio::OntologyIO::obo;
114 use strict;
116 use Bio::Root::IO;
117 use Bio::Ontology::OBOEngine;
118 use Bio::Ontology::Ontology;
119 use Bio::Ontology::OntologyStore;
120 use Bio::Ontology::TermFactory;
121 use Bio::Annotation::Collection;
122 use Text::Balanced qw(extract_quotelike extract_bracketed);
124 use constant TRUE => 1;
125 use constant FALSE => 0;
127 use base qw(Bio::OntologyIO);
129 =head2 new
131 Title : new
132 Usage : $parser = Bio::OntologyIO->new(
133 -format => "obo",
134 -file => "gene_ontology.obo");
135 Function: Creates a new dagflat parser.
136 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
137 Args : -file => a single ontology flat file holding the
138 terms, descriptions and relationships
139 -ontology_name => the name of the ontology; if not specified the
140 parser will assign the name of the ontology as the
141 default-namespace header value from the OBO file.
142 -engine => the Bio::Ontology::OntologyEngineI object
143 to be reused (will be created otherwise); note
144 that every Bio::Ontology::OntologyI will
145 qualify as well since that one inherits from the
146 former.
148 See L<Bio::OntologyIO>.
150 =cut
152 # Let OntologyIO::new() do the instantiation, and override
153 # _initialize for all initialization work
154 sub _initialize {
155 my ( $self, %arg ) = @_;
157 my ( $file, $name, $eng ) = $self->_rearrange(
159 qw( FILE
160 ONTOLOGY_NAME
161 ENGINE)
163 %arg
166 $self->SUPER::_initialize(%arg);
167 delete $self->{'_ontologies'};
169 # ontology engine (and possibly name if it's an OntologyI)
170 $eng = Bio::Ontology::OBOEngine->new() unless $eng;
171 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
172 $self->ontology_name( $eng->name() );
173 $eng = $eng->engine() if $eng->can('engine');
175 $self->_ont_engine($eng);
177 $self->ontology_name($name) if $name;
180 =head2 ontology_name
182 Title : ontology_name
183 Usage : $obj->ontology_name($newval)
184 Function: Get/set the name of the ontology parsed by this module.
185 Example :
186 Returns : value of ontology_name (a scalar)
187 Args : on set, new value (a scalar or undef, optional)
189 =cut
191 sub ontology_name {
192 my $self = shift;
194 return $self->{'ontology_name'} = shift if @_;
195 return $self->{'ontology_name'};
198 =head2 parse
200 Title : parse()
201 Usage : $parser->parse();
202 Function: Parses the files set with "new" or with methods
203 defs_file and _flat_files.
205 Normally you should not need to call this method as it will
206 be called automatically upon the first call to
207 next_ontology().
209 Returns : Bio::Ontology::OntologyEngineI
210 Args :
212 =cut
214 sub parse {
215 my $self = shift;
217 # Setup the default term factory if not done by anyone yet
218 $self->term_factory(Bio::Ontology::TermFactory->new( -type => "Bio::Ontology::OBOterm" ) )
219 unless $self->term_factory();
221 # Parse the file header
222 my $annotations_collection = $self->_header();
224 # Create the default ontology object itself
225 my $ont = Bio::Ontology::Ontology->new(
226 -name => $self->ontology_name(),
227 -engine => $self->_ont_engine()
230 # Assign the file headers
231 $ont->annotation($annotations_collection);
233 # Set up the ontology of the relationship types
234 for (
235 $self->_part_of_relationship(),
236 $self->_is_a_relationship(),
237 $self->_related_to_relationship(),
238 $self->_regulates_relationship(),
239 $self->_positively_regulates_relationship(),
240 $self->_negatively_regulates_relationship(),
243 $_->ontology($ont);
246 $self->_add_ontology($ont);
248 # Adding new terms
249 while ( my $term = $self->_next_term() ) {
251 # Check if the terms has a valid ID and NAME otherwise ignore the term
252 if ( !$term->identifier() || !$term->name() ) {
253 $self->throw( "OBO File Format Error on line "
254 . $self->{'_current_line_no'}
255 . "\nThe term does not have a id/name tag. This term will be ignored."
257 next;
260 my $new_ontology_flag = 1;
261 my $ontologies_array_ref = $self->{'_ontologies'};
263 for my $ontology ( @$ontologies_array_ref ) {
264 my ($oname, $t_ns) = ( $ontology->name, $term->namespace );
265 next unless ( defined($oname) && defined($t_ns) );
266 if ( $oname eq $t_ns ) {
267 # No need to create new ontology
268 $new_ontology_flag = 0;
269 $ont = $ontology;
273 if ( $new_ontology_flag && $term->namespace ) {
274 my $new_ont = Bio::Ontology::Ontology->new(
275 -name => $term->namespace,
276 -engine => $self->_ont_engine
278 $new_ont->annotation($annotations_collection);
279 $self->_add_ontology($new_ont);
280 $ont = $new_ont;
283 $self->_add_term( $term, $ont );
285 # Adding the IS_A relationship
286 for my $parent_term ( @{$self->{'_isa_parents'}} ) {
287 # Check if parent exists, if not then add the term to the graph.
288 if ( ! $self->_has_term($parent_term) ) {
289 $self->_add_term( $parent_term, $ont ); # !
292 $self->_add_relationship( $parent_term, $term,
293 $self->_is_a_relationship(), $ont );
296 # Adding the other relationships like part_of, related_to, develops_from
297 for my $relationship ( keys %{$self->{'_relationships'}} ) {
298 my $reltype;
299 # Check if relationship exists, if not add it
300 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
301 $reltype = $self->_ont_engine->get_relationship_type($relationship);
303 else {
304 $self->_ont_engine->add_relationship_type( $relationship, $ont );
305 $reltype = $self->_ont_engine->get_relationship_type($relationship);
308 # Check if the id already exists in the graph
309 for my $id ( @{$self->{'_relationships'}->{$relationship}} ) {
310 my $parent_term = $self->_create_term_object();
311 $parent_term->identifier($id);
312 $parent_term->ontology($ont);
314 if ( ! $self->_has_term($parent_term) ) {
315 $self->_add_term( $parent_term, $ont );
318 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
323 return $self->_ont_engine();
326 =head2 next_ontology
328 Title : next_ontology
329 Usage :
330 Function: Get the next available ontology from the parser. This is the
331 method prescribed by Bio::OntologyIO.
332 Example :
333 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
334 there is no more ontology in the input.
335 Args :
337 =cut
339 sub next_ontology {
340 my $self = shift;
342 # Parse if not done already
343 $self->parse() unless exists( $self->{'_ontologies'} );
345 # Return next available ontology
346 if ( exists( $self->{'_ontologies'} ) ) {
347 my $ont = shift( @{ $self->{'_ontologies'} } );
348 if ($ont) {
349 my $store = Bio::Ontology::OntologyStore->new();
350 $store->register_ontology($ont);
352 return $ont;
355 return;
358 =head2 close
360 Title : close
361 Usage :
362 Function: Closes this ontology stream and associated file handles.
364 Clients should call this method especially when they write
365 ontologies.
367 We need to override this here in order to close the file
368 handle for the term definitions file.
370 Example :
371 Returns : none
372 Args : none
374 =cut
376 sub close {
377 my $self = shift;
379 # first call the inherited implementation
380 $self->SUPER::close();
383 # INTERNAL METHODS
385 sub _add_ontology {
386 my $self = shift;
387 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
388 foreach my $ont (@_) {
389 $self->throw(
390 ref($ont) . " does not implement Bio::Ontology::OntologyI" )
391 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
393 # the ontology name may have been auto-discovered while parsing
394 # the file
395 $ont->name( $self->ontology_name ) unless $ont->name();
396 push( @{ $self->{'_ontologies'} }, $ont );
400 # This simply delegates. See Ontology::OBOEngine::add_term.
401 sub _add_term {
402 my ( $self, $term, $ont ) = @_;
403 $term->ontology($ont) if $ont && ( !$term->ontology );
404 $self->_ont_engine()->add_term($term);
407 # This simply delegates. See OBOEngine
408 sub _part_of_relationship {
409 my $self = shift;
411 return $self->_ont_engine()->part_of_relationship(@_);
414 # This simply delegates. See OBOEngine
415 sub _is_a_relationship {
416 my $self = shift;
418 return $self->_ont_engine()->is_a_relationship(@_);
421 # This simply delegates. See OBOEngine
422 sub _related_to_relationship {
423 my $self = shift;
425 return $self->_ont_engine()->related_to_relationship(@_);
428 # This simply delegates. See OBOEngine
429 sub _regulates_relationship {
430 my $self = shift;
432 return $self->_ont_engine()->regulates_relationship(@_);
435 # This simply delegates. See OBOEngine
436 sub _positively_regulates_relationship {
437 my $self = shift;
439 return $self->_ont_engine()->positively_regulates_relationship(@_);
442 # This simply delegates. See OBOEngine
443 sub _negatively_regulates_relationship {
444 my $self = shift;
446 return $self->_ont_engine()->negatively_regulates_relationship(@_);
449 # This simply delegates. See OBOEngine
450 sub _add_relationship {
451 my ( $self, $parent, $child, $type, $ont ) = @_;
453 # note the triple terminology (subject,predicate,object) corresponds to
454 # (child,type,parent)
455 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
458 # This simply delegates. See OBOEngine
459 sub _has_term {
460 my $self = shift;
462 return $self->_ont_engine()->has_term(@_);
465 # Holds the OBO engine to be parsed into
466 sub _ont_engine {
467 my ( $self, $value ) = @_;
469 if ( defined $value ) {
470 $self->{"_ont_engine"} = $value;
473 $self->{"_ont_engine"};
476 # Removes the escape chracters from the file
477 sub _filter_line {
478 my ( $self, $line ) = @_;
480 chomp($line);
481 $line =~ tr [\200-\377] [\000-\177];
482 # see 'man perlop', section on tr/
483 # weird ascii characters should be excluded
484 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
485 # preserve \11 (9 - tab) and \12 (10-linefeed)
486 $line =~ tr/\13\14//d; # remove weird characters; 11,12
487 # preserve \15 (13 - carriage return)
488 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
489 $line =~ tr/\177//d; # remove DEL character
491 $line =~ s/^\!.*//;
492 $line =~ s/[^\\]\!.*//;
493 $line =~ s/[^\\]\#.*//;
494 $line =~ s/^\s+//;
495 $line =~ s/\s+$//;
497 return $line;
500 # Parses the header
501 sub _header {
502 my $self = shift;
503 my $annotation_collection = Bio::Annotation::Collection->new();
504 my ( $tag, $value );
505 my $line_counter = 0;
506 $self->{'_current_line_no'} = 0;
507 my $format_version_header_flag = 0;
508 my $default_namespace_header_flag = 0;
510 while ( my $line = $self->_readline() ) {
511 ++$line_counter;
512 my $line = $self->_filter_line($line);
514 if ( !$line ) {
515 if ( !$format_version_header_flag ) {
516 $self->throw("Format Error - Cannot find tag format-version." .
517 "This is required in header" );
520 $self->{'_current_line_no'} = $line_counter;
521 return $annotation_collection;
524 # Check if there is a header
525 if ( $line =~ /\[\w*\]/ ) {
526 $self->throw("Format Error - Cannot find tag format-version." .
527 "This is required in header." );
530 # If the line is not null, check it contains at least one colon
531 $self->_check_colon( $line, $line_counter );
533 # These are the allowed headers. Any other headers will be ignored
534 if ( $line =~ /^(\[|format-version:
535 |data-version:
536 |typeref:
537 |version:
538 |date:
539 |saved-by:
540 |auto-generated-by:
541 |default-namespace:
542 |remark:
543 |subsetdef:
544 |import:
545 |synonymtypedef:
546 |idspace:
547 |default-relationship-id-prefix:
548 |id-mapping:
552 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
553 ( $tag, $value ) = ( $1, $2 );
556 if ( $tag =~ /format-version/) {
557 $format_version_header_flag = 1;
558 }elsif( $tag =~ /default-namespace/ ) {
559 $default_namespace_header_flag = 1;
562 my $header = Bio::Annotation::SimpleValue->new( -value => $value );
563 $annotation_collection->add_Annotation( $tag, $header );
565 # Assign the Ontology name as the value of the default-namespace header
566 if ( $tag =~ /default-namespace/i ) {
567 $self->ontology_name($value);
573 # Parses each stanza of the file
574 sub _next_term {
575 my $self = shift;
576 my $term;
577 my $skip_stanza_flag = 1;
578 my $line_counter = $self->{'_current_line_no'};
580 while ( my $line = $self->_readline() ) {
581 ++$line_counter;
582 my $line = $self->_filter_line($line);
584 if ( !$line && $term ) {
585 $self->{'_current_line_no'} = $line_counter;
586 return $term;
589 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { # New stanza
590 if ( uc($1) eq "TERM" ) {
591 $term = $self->_create_term_object;
592 $skip_stanza_flag = 0;
594 # Reset the relationships after each stanza
595 $self->{'_relationships'} = {};
596 $self->{'_isa_parents'} = undef;
598 elsif ( uc($1) eq "TYPEDEF" ) {
599 $skip_stanza_flag = 1;
600 # Check if this typedef is already defined by the relationship
602 else {
603 $skip_stanza_flag = 1;
604 $self->warn("OBO File Format Warning on line $line_counter $line\n"
605 . "Unrecognized stanza type found. Skipping this stanza." );
607 next;
610 # If the line is not null, check it contains at least one colon
611 $self->_check_colon( $line, $line_counter );
613 # If there is any tag value other than the list below move to the next tag
614 next if (( $line !~ /^(\[|id:
615 |is_anonymous:
616 |name:
617 |namespace:
618 |alt_id:
619 |def:
620 |comment:
621 |subset:
622 |synonym:
623 |xref:
624 |is_a:
625 |intersection_of:
626 |union_of:
627 |disjoint_from:
628 |relationship:
629 |is_obsolete:
630 |replaced_by:
631 |consider:
632 |created_by:
633 |creation_date:
635 ) || $skip_stanza_flag );
637 # Tag/value pair
638 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
639 my ( $tag, $val ) = ( $1, $2 );
641 # If no value for the tag throw a warning
642 if ( !$val ) {
643 $self->warn("OBO File Format Warning on line $line_counter $line\n" .
644 "Tag has no value."
648 my $qh;
649 ( $val, $qh ) = $self->_extract_quals($val);
650 my $val2 = $val;
651 $val2 =~ s/\\,/,/g;
652 $tag = uc($tag);
653 if ( $tag eq "ID" ) {
654 $term->identifier($val);
655 if ( $self->_has_term($term) ) {
656 $term = $self->_ont_engine()->get_terms($val);
659 elsif ( $tag eq "NAME" ) {
660 $term->name($val);
662 elsif ( $tag eq "XREF_ANALOG" ) {
663 if ( !$term->has_dbxref($val) ) {
664 $term->add_dbxref(-dbxrefs => $self->_to_annotation( [$val] ) );
667 elsif ( $tag eq "XREF_UNKNOWN" ) {
668 $term->add_dbxref(-dbxrefs => $self->_to_annotation( [$val] ) );
670 elsif ( $tag eq "NAMESPACE" ) {
671 $term->namespace($val);
673 elsif ( $tag eq "DEF" ) {
674 my ( $defstr, $parts ) = $self->_extract_qstr($val);
675 $term->definition($defstr);
676 my $ann = $self->_to_annotation($parts);
677 $term->add_dbxref( -dbxrefs => $ann );
679 elsif ( $tag eq "SYNONYM" ) {
680 $term->add_synonym($val);
682 elsif ( $tag eq "ALT_ID" ) {
683 $term->add_secondary_id($val);
685 elsif ( $tag =~ /XREF/i ) {
686 $term->add_secondary_id($val);
688 elsif ( $tag eq "IS_OBSOLETE" ) {
689 if ( $val eq 'true' ) {
690 $val = 1;
692 elsif ( $val eq 'false' ) {
693 $val = 0;
695 $term->is_obsolete($val);
697 elsif ( $tag eq "COMMENT" ) {
698 $term->comment($val);
700 elsif ( $tag eq "RELATIONSHIP" ) {
701 $self->_handle_relationship_tag($val);
703 elsif ( $tag eq "IS_A" ) {
704 $val =~ s/ //g;
705 my $parent_term = $self->_create_term_object();
706 $parent_term->identifier($val);
707 push @{ $self->{'_isa_parents'} }, $parent_term;
712 $term;
716 # Creates a Bio::Ontology::OBOterm object
717 sub _create_term_object {
718 my ($self) = @_;
719 my $term = $self->term_factory->create_object();
720 $term;
723 sub _extract_quals {
724 my ( $self, $str ) = @_;
726 my %q = ();
727 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
728 my $return_str = $1;
729 my $extr = $2;
730 if ($extr) {
731 my @qparts = $self->_split_on_comma($extr);
732 foreach (@qparts) {
733 if (/(\w+)=\"(.*)\"/) {
734 $q{$1} = $2;
736 elsif (/(\w+)=\'(.*)\'/) {
737 $q{$1} = $2;
739 else {
740 warn("$_ in $str");
744 return ( $return_str, \%q );
746 else {
747 return ( $str, {} );
751 sub _extract_qstr {
752 my ( $self, $str ) = @_;
754 my ( $extr, $rem, $prefix ) = extract_quotelike($str);
755 my $txt = $extr;
756 $txt =~ s/^\"//;
757 $txt =~ s/\"$//;
758 if ($prefix) {
759 warn("illegal prefix: $prefix in: $str");
762 my @extra = ();
764 # e.g. synonym: "foo" EXACT [...]
765 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
766 $rem = $2;
767 push( @extra, split( ' ', $1 ) );
770 my @parts = ();
771 while ( ( $extr, $rem, $prefix ) = extract_bracketed( $rem, '[]' ) ) {
772 last unless $extr;
773 $extr =~ s/^\[//;
774 $extr =~ s/\]$//;
775 push( @parts, $extr ) if $extr;
777 @parts =
778 map { $self->_split_on_comma($_) } @parts;
780 $txt =~ s/\\//g;
782 ( $txt, \@parts, \@extra );
785 sub _split_on_comma {
786 my ( $self, $str ) = @_;
787 my @parts = ();
788 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
789 $str = $1;
790 my $part = $2;
791 unshift( @parts, $part );
792 $str =~ s/,\s*$//;
794 unshift( @parts, $str );
796 return map { s/\\//g; $_ } @parts;
799 # This method checks for an existing colon in a line
800 sub _check_colon {
801 my ( $self, $line, $line_no ) = @_;
802 if ( $line && !( $line =~ /:/ ) ) {
803 $self->throw("OBO File Format Error on line $line_no $line\n" .
804 "Cannot find key-terminating colon"
809 # This method handles relationship tags
810 sub _handle_relationship_tag {
811 my ( $self, $val ) = @_;
812 my @parts = split( / /, $val );
813 my $relationship = uc($parts[0]);
814 my $id = $parts[1] =~ /\^(w+)\s+\!/ ? $1 : $parts[1];
815 my $parent_term = $self->_create_term_object();
816 $parent_term->identifier($id);
818 if ( my $realtionships_hash = $self->{'_relationships'} ) {
819 my $id_array_ref = $$realtionships_hash{$relationship};
820 if ( !$id_array_ref ) {
821 my @ids;
822 push( @ids, $id );
823 $$realtionships_hash{$relationship} = \@ids;
826 else {
827 push( @$id_array_ref, $id );
832 # Convert simple strings to Bio::Annotation::DBLinks
833 sub _to_annotation {
834 my ($self , $links) = @_;
835 return unless $links;
836 my @dbxrefs;
837 for my $string (@{$links}) {
838 my ($db, $id) = split(':',$string);
839 push @dbxrefs, Bio::Annotation::DBLink->new(-database => $db, -primary_id => $id);
842 \@dbxrefs;