Minor edits
[bioperl-live.git] / Bio / OntologyIO / obo.pm
blob843d6f7e8ceaaba6e9124408b836f853d167fd3e
2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
13 =head1 NAME
15 Bio::OntologyIO::obo - a parser for OBO flat-file format from Gene Ontology Consortium
17 =head1 SYNOPSIS
19 use Bio::OntologyIO;
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
23 ( -format => "obo",
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms)," root terms, and ",
29 scalar($ont->get_all_terms)," total terms, and ",
30 scalar($ont->get_leaf_terms)," leaf terms\n";
34 =head1 DESCRIPTION
36 Needs Graph.pm from CPAN.
38 =head1 FEEDBACK
40 =head2 Mailing Lists
42 User feedback is an integral part of the evolution of this and other
43 Bioperl modules. Send your comments and suggestions preferably to the
44 Bioperl mailing lists. Your participation is much appreciated.
46 bioperl-l@bioperl.org - General discussion
47 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
49 =head2 Support
51 Please direct usage questions or support issues to the mailing list:
53 I<bioperl-l@bioperl.org>
55 rather than to the module maintainer directly. Many experienced and
56 responsive experts will be able to look at the problem and quickly
57 address it. Please include a thorough description of the problem
58 with code and data examples if at all possible.
60 =head2 Reporting Bugs
62 Report bugs to the Bioperl bug tracking system to help us keep track
63 of the bugs and their resolution. Bug reports can be submitted via the
64 web:
66 https://github.com/bioperl/bioperl-live/issues
68 =head1 AUTHOR
70 Sohel Merchant
72 Email: s-merchant@northwestern.edu
75 Address:
77 Northwestern University
78 Center for Genetic Medicine (CGM), dictyBase
79 Suite 1206,
80 676 St. Clair st
81 Chicago IL 60611
83 =head2 CONTRIBUTOR
85 Hilmar Lapp, hlapp at gmx.net
86 Chris Mungall, cjm at fruitfly.org
88 =head1 APPENDIX
90 The rest of the documentation details each of the object
91 methods. Internal methods are usually preceded with a _
93 =cut
95 package Bio::OntologyIO::obo;
97 use strict;
99 use Bio::Root::IO;
100 use Bio::Ontology::OBOEngine;
101 use Bio::Ontology::Ontology;
102 use Bio::Ontology::OntologyStore;
103 use Bio::Ontology::TermFactory;
104 use Bio::Annotation::Collection;
105 use Text::Balanced qw(extract_quotelike extract_bracketed);
107 use constant TRUE => 1;
108 use constant FALSE => 0;
110 use base qw(Bio::OntologyIO);
112 =head2 new
114 Title : new
115 Usage : $parser = Bio::OntologyIO->new(
116 -format => "obo",
117 -file => "gene_ontology.obo");
118 Function: Creates a new OBO parser.
119 Returns : A new OBO parser object, implementing Bio::OntologyIO.
120 Args : -file => a single ontology flat file holding the
121 terms, descriptions and relationships
122 -ontology_name => the name of the ontology; if not specified the
123 parser will assign the name of the ontology as the
124 default-namespace header value from the OBO file.
125 -engine => the Bio::Ontology::OntologyEngineI object
126 to be reused (will be created otherwise); note
127 that every Bio::Ontology::OntologyI will
128 qualify as well since that one inherits from the
129 former.
131 See L<Bio::OntologyIO>.
133 =cut
135 # in reality, we let OntologyIO::new do the instantiation, and override
136 # _initialize for all initialization work
sub _initialize {
    # Initializes a freshly constructed parser: hands the arguments to the
    # parent class, then installs the ontology engine and the ontology name.
    #
    # Args   : the named arguments given to new(); FILE, ONTOLOGY_NAME and
    #          ENGINE are extracted here through _rearrange().
    # Returns: nothing meaningful.
    my ( $self, %arg ) = @_;

    # The FILE slot is extracted for positional reasons only; it is not
    # used in this method.
    my ( undef, $ontology_name, $engine ) =
        $self->_rearrange( [ qw(FILE ONTOLOGY_NAME ENGINE) ], %arg );

    $self->SUPER::_initialize(%arg);

    # Drop any previously parsed ontologies so that next_ontology() will
    # trigger a fresh parse.
    delete $self->{'_ontologies'};

    # Fall back to a brand new OBO engine when the caller supplied none.
    $engine ||= Bio::Ontology::OBOEngine->new();

    # An OntologyI may be passed in place of a bare engine: adopt its name
    # and unwrap the underlying engine when it exposes one.
    if ( $engine->isa("Bio::Ontology::OntologyI") ) {
        $self->ontology_name( $engine->name() );
        if ( $engine->can('engine') ) {
            $engine = $engine->engine();
        }
    }
    $self->_ont_engine($engine);

    # An explicitly supplied name wins over the one taken from the engine.
    $self->ontology_name($ontology_name) if $ontology_name;
}    # _initialize
164 =head2 ontology_name
166 Title : ontology_name
167 Usage : $obj->ontology_name($newval)
168 Function: Get/set the name of the ontology parsed by this module.
169 Example :
170 Returns : value of ontology_name (a scalar)
171 Args : on set, new value (a scalar or undef, optional)
174 =cut
sub ontology_name {
    # Combined getter/setter for the name of the ontology being parsed.
    #
    # Args   : optional new value (a scalar or undef).
    # Returns: the stored name (after the update when a value was passed).
    my ( $self, @new_value ) = @_;

    # Test the argument count rather than definedness so that an explicit
    # undef still clears the stored name.
    if (@new_value) {
        $self->{'ontology_name'} = $new_value[0];
    }
    return $self->{'ontology_name'};
}
183 =head2 parse
185 Title : parse()
186 Usage : $parser->parse();
187 Function: Parses the files set with "new" or with methods
188 defs_file and _flat_files.
190 Normally you should not need to call this method as it will
191 be called automatically upon the first call to
192 next_ontology().
194 Returns : Bio::Ontology::OntologyEngineI
195 Args :
197 =cut
sub parse {
    # Parses the complete OBO input: reads the header, creates the default
    # ontology, then adds every [Term] stanza together with its is_a and
    # other relationships to the ontology engine.
    #
    # Terms carrying a namespace are placed in the ontology of that name
    # (created on first use); terms without a namespace go to the default
    # ontology.
    #
    # Args   : none
    # Returns: the ontology engine (value of _ont_engine()).
    # Throws : if a term lacks an identifier or a name.
    my $self = shift;

    # Setup the default term factory if not done by anyone yet
    unless ( $self->term_factory() ) {
        $self->term_factory(
            Bio::Ontology::TermFactory->new( -type => "Bio::Ontology::OBOterm" ) );
    }

    # Parse the file header
    my $annotations_collection = $self->_header();

    # Create the default ontology object itself and attach the file headers
    my $default_ont = Bio::Ontology::Ontology->new(
        -name   => $self->ontology_name(),
        -engine => $self->_ont_engine()
    );
    $default_ont->annotation($annotations_collection);

    # Set up the ontology of the built-in relationship types
    for my $builtin_type (
        $self->_part_of_relationship(),
        $self->_is_a_relationship(),
        $self->_related_to_relationship(),
        $self->_regulates_relationship(),
        $self->_positively_regulates_relationship(),
        $self->_negatively_regulates_relationship(),
      )
    {
        $builtin_type->ontology($default_ont);
    }

    $self->_add_ontology($default_ont);

    my $engine = $self->_ont_engine();

    # Adding new terms
    while ( my $term = $self->_next_term() ) {

        # A term without ID or NAME is a format error. throw() aborts the
        # parse, so nothing after it in this branch would ever run.
        if ( !$term->identifier() || !$term->name() ) {
            $self->throw( "OBO File Format Error on line "
                  . $self->{'_current_line_no'}
                  . " \nThe term does not have a id/name tag. This term will be ignored.\n"
            );
        }

        # Choose the target ontology afresh for every term. Starting from
        # the default ontology keeps a namespace-less term from silently
        # inheriting the ontology chosen for the previous term.
        my $namespace     = $term->namespace;
        my $ont           = $default_ont;
        my $found_by_name = 0;

        if ( defined $namespace ) {
            for my $ontology ( @{ $self->{'_ontologies'} } ) {
                my $oname = $ontology->name;
                next unless defined $oname;
                if ( $oname eq $namespace ) {
                    # No need to create new ontology
                    $found_by_name = 1;
                    $ont           = $ontology;
                }
            }
        }

        # First term seen for this namespace: create its ontology
        if ( !$found_by_name && $namespace ) {
            my $new_ont = Bio::Ontology::Ontology->new(
                -name   => $namespace,
                -engine => $engine
            );
            $new_ont->annotation($annotations_collection);
            $self->_add_ontology($new_ont);
            $ont = $new_ont;
        }

        $self->_add_term( $term, $ont );

        # Adding the IS_A relationships collected by _next_term()
        for my $parent_term ( @{ $self->{'_isa_parents'} } ) {
            # Check if parent exists, if not then add the term to the graph.
            if ( !$self->_has_term($parent_term) ) {
                $self->_add_term( $parent_term, $ont );
            }

            $self->_add_relationship( $parent_term, $term,
                $self->_is_a_relationship(), $ont );
        }

        # Adding the other relationships like part_of, related_to,
        # develops_from; the hash maps relationship name => list of ids.
        my $relationship_hash_ref = $self->{'_relationships'};
        for my $relationship ( keys %{$relationship_hash_ref} ) {

            # Look the relationship type up once; register it on a miss.
            my $reltype = $engine->get_relationship_type($relationship);
            if ( !$reltype ) {
                $engine->add_relationship_type( $relationship, $ont );
                $reltype = $engine->get_relationship_type($relationship);
            }

            for my $id ( @{ $relationship_hash_ref->{$relationship} } ) {
                my $parent_term = $self->_create_term_object();
                $parent_term->identifier($id);
                $parent_term->ontology($ont);

                # Add a stub for the target when it is not in the graph yet
                if ( !$self->_has_term($parent_term) ) {
                    $self->_add_term( $parent_term, $ont );
                }

                $self->_add_relationship( $parent_term, $term, $reltype, $ont );
            }
        }
    }

    return $engine;
}
313 =head2 next_ontology
315 Title : next_ontology
316 Usage :
317 Function: Get the next available ontology from the parser. This is the
318 method prescribed by Bio::OntologyIO.
319 Example :
320 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
321 there is no more ontology in the input.
322 Args :
325 =cut
sub next_ontology {
    # Hands out the parsed ontologies one at a time, parsing the input
    # first if that has not happened yet. Each ontology is registered with
    # a Bio::Ontology::OntologyStore before it is returned.
    #
    # Returns: the next ontology object, or nothing when none is left.
    my ($self) = @_;

    # parse if not done already
    $self->parse() if !exists $self->{'_ontologies'};

    return if !exists $self->{'_ontologies'};

    my $next = shift @{ $self->{'_ontologies'} };
    return if !$next;

    Bio::Ontology::OntologyStore->new()->register_ontology($next);
    return $next;
}
346 =head2 close
348 Title : close
349 Usage :
350 Function: Closes this ontology stream and associated file handles.
352 Clients should call this method especially when they write
353 ontologies.
355 We need to override this here in order to close the file
356 handle for the term definitions file.
358 Example :
359 Returns : none
360 Args : none
363 =cut
sub close {
    # Closes the stream by delegating to the inherited implementation.
    #
    # Returns: whatever the parent class' close() returns.
    my ($self) = @_;

    return $self->SUPER::close();
}
372 # INTERNAL METHODS
373 # ----------------
sub _add_ontology {
    # Appends one or more ontology objects to the list handed out by
    # next_ontology().
    #
    # Args   : list of objects implementing Bio::Ontology::OntologyI.
    # Throws : if an argument is not such an object.
    my ( $self, @ontologies ) = @_;

    if ( !exists $self->{'_ontologies'} ) {
        $self->{'_ontologies'} = [];
    }

    for my $candidate (@ontologies) {
        my $is_ontology =
            ref($candidate) && $candidate->isa("Bio::Ontology::OntologyI");
        if ( !$is_ontology ) {
            $self->throw(
                ref($candidate) . " does not implement Bio::Ontology::OntologyI" );
        }

        # A nameless ontology inherits the parser's ontology name, which
        # may have been discovered while reading the file header.
        if ( !$candidate->name() ) {
            $candidate->name( $self->ontology_name );
        }
        push @{ $self->{'_ontologies'} }, $candidate;
    }
}
390 # This simply delegates. See OBOEngine.
sub _add_term {
    # Adds a term to the ontology engine, assigning it to the given
    # ontology first when the term has none yet.
    #
    # Args   : the term object; optionally the ontology to assign.
    # Returns: whatever the engine's add_term() returns.
    my ( $self, $term, $ont ) = @_;

    if ( $ont && !$term->ontology ) {
        $term->ontology($ont);
    }
    $self->_ont_engine()->add_term($term);
}    # _add_term
397 # This simply delegates. See OBOEngine
sub _part_of_relationship {
    # Forwards to the engine's part_of_relationship(), passing along any
    # arguments unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->part_of_relationship(@args);
}    # _part_of_relationship
404 # This simply delegates. See OBOEngine
sub _is_a_relationship {
    # Forwards to the engine's is_a_relationship(), passing along any
    # arguments unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->is_a_relationship(@args);
}    # _is_a_relationship
411 # This simply delegates. See OBOEngine
sub _related_to_relationship {
    # Forwards to the engine's related_to_relationship(), passing along
    # any arguments unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->related_to_relationship(@args);
}    # _related_to_relationship
419 # This simply delegates. See OBOEngine
sub _regulates_relationship {
    # Forwards to the engine's regulates_relationship(), passing along
    # any arguments unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->regulates_relationship(@args);
}    # _regulates_relationship
426 # This simply delegates. See OBOEngine
sub _positively_regulates_relationship {
    # Forwards to the engine's positively_regulates_relationship(),
    # passing along any arguments unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->positively_regulates_relationship(@args);
}    # _positively_regulates_relationship
434 # This simply delegates. See OBOEngine
sub _negatively_regulates_relationship {
    # Forwards to the engine's negatively_regulates_relationship(),
    # passing along any arguments unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->negatively_regulates_relationship(@args);
}    # _negatively_regulates_relationship
441 # This simply delegates. See OBOEngine
sub _add_relationship {
    # Registers a relationship with the engine.
    #
    # Args   : parent term, child term, relationship type, ontology.
    # Returns: whatever the engine's add_relationship() returns.
    my ( $self, $parent, $child, $type, $ont ) = @_;

    # The engine takes a (subject, predicate, object) triple, which maps
    # to (child, type, parent) -- hence the swapped argument order.
    my @triple = ( $child, $type, $parent );
    $self->_ont_engine()->add_relationship( @triple, $ont );
}    # _add_relationship
451 # This simply delegates. See OBOEngine
sub _has_term {
    # Forwards to the engine's has_term(), passing along any arguments
    # unchanged.
    my ( $self, @args ) = @_;

    my $engine = $self->_ont_engine();
    return $engine->has_term(@args);
}    # _has_term
458 # Holds the OBO engine to be parsed into
sub _ont_engine {
    # Getter/setter for the ontology engine that terms are parsed into.
    #
    # Args   : optional new engine; an undefined value leaves the stored
    #          engine untouched.
    # Returns: the stored engine.
    my ( $self, $value ) = @_;

    $self->{"_ont_engine"} = $value if defined $value;

    return $self->{"_ont_engine"};
}    # _ont_engine
469 # Removes control characters and comments from a line of the file
sub _filter_line {
    # Normalizes one raw input line: drops the trailing newline, folds
    # 8-bit characters into the 7-bit range, deletes control characters,
    # strips comments and trims surrounding whitespace.
    #
    # Args   : the raw line.
    # Returns: the cleaned line (an empty string for blank or comment-only
    #          lines).
    my ( $self, $line ) = @_;

    chomp($line);

    # Map characters 0x80-0xFF onto 0x00-0x7F (see the tr section of
    # 'man perlop').
    $line =~ tr [\200-\377]
                [\000-\177];

    # Delete control characters: 0-8, 11, 12, 14-31 and DEL. Tab (9),
    # linefeed (10) and carriage return (13) are kept at this point; the
    # whitespace trim below removes any that end up at either edge.
    $line =~ tr/\0-\10\13\14\16-\37\177//d;

    # A line that starts with '!' is a comment in its entirety.
    $line =~ s/^\!.*//;

    # Strip a trailing comment introduced by an unescaped '!' or '#'. The
    # character in front of the marker is captured and restored: it is
    # only matched to prove the marker is not backslash-escaped and is not
    # part of the comment itself.
    $line =~ s/([^\\])\!.*/$1/;
    $line =~ s/([^\\])\#.*/$1/;

    $line =~ s/^\s+//;
    $line =~ s/\s+$//;

    return $line;
}
493 # Parses the header
sub _header {
    # Reads the file header (everything up to the first blank line) and
    # collects the recognized header tags as annotations.
    #
    # Side effects: sets $self->{'_current_line_no'} to the number of
    # lines consumed and sets ontology_name() from the default-namespace
    # header.
    #
    # Returns: a Bio::Annotation::Collection holding one SimpleValue per
    #          recognized header tag. The collection is also returned when
    #          the input ends before a blank line is seen.
    # Throws : if a stanza starts before the header has ended, if a
    #          non-empty line has no colon (via _check_colon), or if the
    #          blank line ending the header is reached without both
    #          format-version and default-namespace having been seen.
    my $self                  = shift;
    my $annotation_collection = Bio::Annotation::Collection->new();
    my $line_counter          = 0;
    $self->{'_current_line_no'} = 0;

    my $seen_format_version    = 0;
    my $seen_default_namespace = 0;
    my $header_terminated      = 0;

    while ( my $raw = $self->_readline() ) {
        ++$line_counter;
        my $line = $self->_filter_line($raw);

        # A line that is empty after filtering ends the header.
        if ( !$line ) {
            $header_terminated = 1;
            last;
        }

        # A stanza opener inside the header means the header was never
        # terminated. Anchored so that a bracketed word inside a header
        # value (e.g. in a remark) does not trigger it.
        if ( $line =~ /^\[\w*\]/ ) {
            $self->throw(
"OBO File Format Error - \nCannot find tag format-version. This is a required header.\n"
            );
        }

        # Every non-empty line must contain at least one colon.
        $self->_check_colon( $line, $line_counter );

        # These are the allowed headers. Any other header is ignored.
        next
          unless $line =~
/^(\[|format-version:|typeref:|version:|date:|saved-by:|auto-generated-by:|default-namespace:|remark:|subsetdef:)/;

        # Scoped per line so a line that fails to split can never reuse
        # the tag and value of the previous one.
        next unless $line =~ /^([\w\-]+)\:\s*(.*)/;
        my ( $tag, $value ) = ( $1, $2 );

        if ( $tag eq 'format-version' ) {
            $seen_format_version = 1;
        }
        elsif ( $tag eq 'default-namespace' ) {
            $seen_default_namespace = 1;
        }

        my $header = Bio::Annotation::SimpleValue->new( -value => $value );
        $annotation_collection->add_Annotation( $tag, $header );

        # The default-namespace header doubles as the ontology name.
        if ( $tag eq 'default-namespace' ) {
            $self->ontology_name($value);
        }
    }

    # The required-header check is applied only when a blank line ended
    # the header; input that simply runs out is not turned into a new
    # error here.
    if ( $header_terminated
        && ( !$seen_format_version || !$seen_default_namespace ) )
    {
        $self->throw(
"OBO File Format Error - \nCannot find tag format-version and/ default-namespace . These are required header.\n"
        );
    }

    $self->{'_current_line_no'} = $line_counter;
    return $annotation_collection;
}
558 ### Parses each stanza of the file
sub _next_term {
    # Reads the next [Term] stanza from the input and turns it into a term
    # object. Stanzas of any other type are skipped.
    #
    # Side effects: resets and fills $self->{'_relationships'} (hash of
    # relationship name => list of ids) and $self->{'_isa_parents'} (list
    # of parent term objects) for the stanza, and advances
    # $self->{'_current_line_no'} when a term is completed by a blank line.
    #
    # Returns: the term object, or undef when the input holds no further
    #          [Term] stanza.
    my $self = shift;
    my $term;

    # Lines are ignored until a [Term] stanza header has been seen.
    my $skip_stanza_flag = 1;
    my $line_counter     = $self->{'_current_line_no'};

    while ( my $raw = $self->_readline() ) {
        ++$line_counter;
        my $line = $self->_filter_line($raw);

        # An empty line after a started term completes that term.
        if ( !$line && $term ) {
            $self->{'_current_line_no'} = $line_counter;
            return $term;
        }

        if ( $line =~ /^\[(\w+)\]\s*(.*)/ ) {    # New stanza
            my $stanza_type = uc($1);

            if ( $stanza_type eq "TERM" ) {
                $term             = $self->_create_term_object;
                $skip_stanza_flag = 0;

                # Reset the relationships after each stanza
                $self->{'_relationships'} = {};
                $self->{'_isa_parents'}   = undef;
            }
            elsif ( $stanza_type eq "TYPEDEF" ) {
                # Typedef stanzas are recognized but not processed.
                $skip_stanza_flag = 1;
            }
            else {
                $skip_stanza_flag = 1;
                $self->warn(
"OBO File Format Warning on line $line_counter $line \nUnrecognized stanza type found. Skipping this stanza.\n"
                );
            }
            next;
        }

        # If the line is not empty, check that it contains at least one colon
        $self->_check_colon( $line, $line_counter );

        # Skip tags outside the list below, and everything inside a
        # stanza that is being skipped.
        next
          if $skip_stanza_flag
          || $line !~
/^(\[|id:|name:|is_a:|relationship:|namespace:|is_obsolete:|alt_id:|def:|xref_analog:|exact_synonym:|broad_synonym:|related_synonym:|synonym:|comment:|xref:)/;

        # Tag-value pair
        next unless $line =~ /^([\w\-]+)\:\s*(.*)/;
        my ( $tag, $val ) = ( $1, $2 );

        # Warn about a tag without a value; a literal "0" is a value.
        if ( $val eq '' ) {
            $self->warn(
"OBO File Format Warning on line $line_counter $line \nTag has no value\n"
            );
        }

        # Strip a trailing {qualifier} block; the qualifiers themselves
        # are not used.
        ($val) = $self->_extract_quals($val);
        $tag = uc($tag);

        if ( $tag eq "ID" ) {
            $term->identifier($val);

            # Reuse the term the engine already holds under this id
            # (e.g. one added earlier as a relationship target).
            if ( $self->_has_term($term) ) {
                $term = $self->_ont_engine()->get_terms($val);
            }
        }
        elsif ( $tag eq "NAME" ) {
            $term->name($val);
        }
        elsif ( $tag eq "XREF_ANALOG" ) {
            if ( !$term->has_dbxref($val) ) {
                $term->add_dbxref( -dbxrefs => $self->_to_annotation( [$val] ) );
            }
        }
        elsif ( $tag eq "XREF_UNKNOWN" ) {
            # Not reachable at present: "xref_unknown:" is missing from
            # the list of accepted tags above.
            $term->add_dbxref( -dbxrefs => $self->_to_annotation( [$val] ) );
        }
        elsif ( $tag eq "NAMESPACE" ) {
            $term->namespace($val);
        }
        elsif ( $tag eq "DEF" ) {
            # The quoted text is the definition; the bracketed parts
            # become dbxrefs.
            my ( $defstr, $parts ) = $self->_extract_qstr($val);
            $term->definition($defstr);
            $term->add_dbxref( -dbxrefs => $self->_to_annotation($parts) );
        }
        elsif ( $tag =~ /(\w*)synonym/i ) {
            # The value is stored as is, quotes included.
            $term->add_synonym($val);
        }
        elsif ( $tag eq "ALT_ID" ) {
            $term->add_secondary_id($val);
        }
        elsif ( $tag =~ /XREF/i ) {
            $term->add_secondary_id($val);
        }
        elsif ( $tag eq "IS_OBSOLETE" ) {
            # Map the OBO booleans onto 1/0; other values pass through.
            if    ( $val eq 'true' )  { $val = 1; }
            elsif ( $val eq 'false' ) { $val = 0; }
            $term->is_obsolete($val);
        }
        elsif ( $tag eq "COMMENT" ) {
            $term->comment($val);
        }
        elsif ( $tag eq "RELATIONSHIP" ) {
            $self->_handle_relationship_tag($val);
        }
        elsif ( $tag eq "IS_A" ) {
            $val =~ s/ //g;
            my $parent_term = $self->_create_term_object();
            $parent_term->identifier($val);

            # Creates the list on first use.
            push @{ $self->{'_isa_parents'} }, $parent_term;
        }
    }

    # End of input: hand back a term still in progress, if any.
    return $term;
}
700 # Creates a Bio::Ontology::OBOterm object
sub _create_term_object {
    # Asks the configured term factory for a fresh, empty term object.
    my $self = shift;

    return $self->term_factory->create_object();
}
sub _extract_quals {
    # Splits a trailing "{name="value", ...}" qualifier block off a tag
    # value.
    #
    # Args   : the tag value.
    # Returns: a two-element list: the value without the qualifier block,
    #          and a hash ref of the qualifiers found (empty when the
    #          value has no such block).
    my ( $self, $str ) = @_;

    # No whitespace-separated trailing {...} group: nothing to split off.
    return ( $str, {} ) unless $str =~ /(.*)\s+(\{.*\})\s*$/;

    my ( $value, $block ) = ( $1, $2 );
    my %qualifier;

    if ($block) {
        for my $pair ( $self->_split_on_comma($block) ) {

            # Accept double-quoted values first, then single-quoted ones.
            if ( $pair =~ /(\w+)=\"(.*)\"/ || $pair =~ /(\w+)=\'(.*)\'/ ) {
                $qualifier{$1} = $2;
            }
            else {
                warn("$pair in $str");
            }
        }
    }

    return ( $value, \%qualifier );
}
sub _extract_qstr {
    # Takes apart a value of the form
    #     "quoted text" SCOPE [ref, ref, ...]
    # as used by def: and synonym: tags.
    #
    # Args   : the tag value.
    # Returns: a three-element list: the quoted text with the surrounding
    #          double quotes and all backslashes removed, an array ref of
    #          the comma-separated entries of the bracketed group(s), and
    #          an array ref of the words between the text and the brackets.
    my ( $self, $str ) = @_;

    my ( $quoted, $rest, $prefix ) = extract_quotelike($str);
    warn("illegal prefix: $prefix in: $str") if $prefix;

    # Peel the surrounding double quotes off the quoted string.
    ( my $text = $quoted ) =~ s/^\"//;
    $text =~ s/\"$//;

    # eg synonym: "foo" EXACT [...]
    my @extra;
    if ( $rest =~ /(\w+)\s+(\[.*)/ ) {
        my ( $scope, $brackets ) = ( $1, $2 );
        @extra = split( ' ', $scope );
        $rest  = $brackets;
    }

    # Consume the bracketed groups one after the other until none is left.
    my @parts;
    while (1) {
        ( my $chunk, $rest ) = extract_bracketed( $rest, '[]' );
        last unless $chunk;
        $chunk =~ s/^\[//;
        $chunk =~ s/\]$//;
        push @parts, $self->_split_on_comma($chunk) if $chunk;
    }

    $text =~ s/\\//g;
    return ( $text, \@parts, \@extra );
}
sub _split_on_comma {
    # Splits a string on commas that are not escaped with a backslash and
    # removes all backslashes from the resulting pieces.
    #
    # Args   : the string to split.
    # Returns: the list of pieces, in their original order.
    my ( $self, $str ) = @_;
    my @pieces;

    # The greedy match always finds the right-most unescaped comma, so the
    # string is taken apart from the right and the pieces are prepended.
    while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
        my ( $head, $tail ) = ( $1, $2 );
        unshift @pieces, $tail;
        ( $str = $head ) =~ s/,\s*$//;
    }
    unshift @pieces, $str;

    # Drop the escape characters.
    s/\\//g for @pieces;
    return @pieces;
}
784 # This method checks for an existing colon in a line
sub _check_colon {
    # Verifies that a non-empty line contains the colon that terminates
    # its tag.
    #
    # Args   : the line and its line number (used in the error message).
    # Throws : if the line is non-empty and holds no colon.
    my ( $self, $line, $line_no ) = @_;

    $self->throw(
"OBO File Format Error on line $line_no $line - \nCannot find key-terminating colon\n"
    ) if $line && $line !~ /:/;
}
794 # This method handles relationship tags
sub _handle_relationship_tag {
    # Records the target of a "relationship:" tag for the term currently
    # being read. The value has the form "<type> <target id>"; the target
    # id is appended to the list kept under the upper-cased type in
    # $self->{'_relationships'}.
    #
    # Args   : the tag value.
    # Returns: nothing.
    my ( $self, $val ) = @_;

    # Split on any run of whitespace so that repeated blanks or tabs do
    # not produce an empty target id.
    my ( $type, $id ) = split( ' ', $val );

    if ( !defined $type || !defined $id ) {
        $self->warn(
            "OBO File Format Warning - relationship tag '$val' has no target id");
        return;
    }

    # Nothing is recorded when no relationship store has been set up.
    my $relationships = $self->{'_relationships'} or return;

    # Creates the per-type list on first use.
    push @{ $relationships->{ uc $type } }, $id;
    return;
}
819 # convert simple strings to Bio::Annotation::DBLinks
sub _to_annotation {
    # Converts "DB:ID" strings into Bio::Annotation::DBLink objects.
    #
    # Args   : array ref of strings.
    # Returns: array ref of DBLink objects, or nothing when no array ref
    #          was given.
    my ( $self, $links ) = @_;
    return unless $links;

    my @dbxrefs;
    for my $string ( @{$links} ) {

        # Split on the first colon only, so that ids which themselves
        # contain colons (URLs, for instance) are kept whole.
        my ( $db, $id ) = split( /:/, $string, 2 );
        push @dbxrefs,
            Bio::Annotation::DBLink->new( -database => $db, -primary_id => $id );
    }
    return \@dbxrefs;
}