Merge remote branch 'upstream/master' into topic/tree_dbsqlite_memoryfix
[bioperl-live.git] / Bio / OntologyIO / obo.pm
blob11ffa931ba22fe9aac61f143064a43dacba4792d
2 # BioPerl module for Bio::OntologyIO::obo
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
8 # Copyright Sohel Merchant
10 # You may distribute this module under the same terms as perl itself
13 =head1 NAME
15 Bio::OntologyIO::obo - a parser for OBO flat-file format from Gene Ontology Consortium
17 =head1 SYNOPSIS
19 use Bio::OntologyIO;
21 # do not use directly -- use via Bio::OntologyIO
22 my $parser = Bio::OntologyIO->new
23 ( -format => "obo",
24 -file => "gene_ontology.obo");
26 while(my $ont = $parser->next_ontology()) {
27 print "read ontology ",$ont->name()," with ",
28 scalar($ont->get_root_terms)," root terms, and ",
29 scalar($ont->get_all_terms)," total terms, and ",
30 scalar($ont->get_leaf_terms)," leaf terms\n";
34 =head1 DESCRIPTION
36 Needs Graph.pm from CPAN.
38 =head1 FEEDBACK
40 =head2 Mailing Lists
42 User feedback is an integral part of the evolution of this and other
43 Bioperl modules. Send your comments and suggestions preferably to the
44 Bioperl mailing lists Your participation is much appreciated.
46 bioperl-l@bioperl.org - General discussion
47 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
49 =head2 Support
51 Please direct usage questions or support issues to the mailing list:
53 I<bioperl-l@bioperl.org>
55 rather than to the module maintainer directly. Many experienced and
56 reponsive experts will be able look at the problem and quickly
57 address it. Please include a thorough description of the problem
58 with code and data examples if at all possible.
60 =head2 Reporting Bugs
62 Report bugs to the Bioperl bug tracking system to help us keep track
63 the bugs and their resolution. Bug reports can be submitted via the
64 web:
66 http://bugzilla.open-bio.org/
68 =head1 AUTHOR
70 Sohel Merchant
72 Email: s-merchant@northwestern.edu
75 Address:
77 Northwestern University
78 Center for Genetic Medicine (CGM), dictyBase
79 Suite 1206,
80 676 St. Clair st
81 Chicago IL 60611
83 =head2 CONTRIBUTOR
85 Hilmar Lapp, hlapp at gmx.net
86 Chris Mungall, cjm at fruitfly.org
88 =head1 APPENDIX
90 The rest of the documentation details each of the object
91 methods. Internal methods are usually preceded with a _
93 =cut
95 package Bio::OntologyIO::obo;
97 use strict;
99 use Bio::Root::IO;
100 use Bio::Ontology::OBOEngine;
101 use Bio::Ontology::Ontology;
102 use Bio::Ontology::OntologyStore;
103 use Bio::Ontology::TermFactory;
104 use Bio::Annotation::Collection;
105 use Text::Balanced qw(extract_quotelike extract_bracketed);
107 use constant TRUE => 1;
108 use constant FALSE => 0;
110 use base qw(Bio::OntologyIO);
112 =head2 new
114 Title : new
115 Usage : $parser = Bio::OntologyIO->new(
116 -format => "obo",
117 -file => "gene_ontology.obo");
118 Function: Creates a new dagflat parser.
119 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
120 Args : -file => a single ontology flat file holding the
121 terms, descriptions and relationships
122 -ontology_name => the name of the ontology; if not specified the
123 parser will assign the name of the ontology as the
124 default-namespace header value from the OBO file.
125 -engine => the Bio::Ontology::OntologyEngineI object
126 to be reused (will be created otherwise); note
127 that every Bio::Ontology::OntologyI will
128 qualify as well since that one inherits from the
129 former.
131 See L<Bio::OntologyIO>.
133 =cut
135 # in reality, we let OntologyIO::new do the instantiation, and override
136 # _initialize for all initialization work
137 sub _initialize {
138 my ( $self, %arg ) = @_;
140 my ( $file, $name, $eng ) = $self->_rearrange(
142 qw( FILE
143 ONTOLOGY_NAME
144 ENGINE)
146 %arg
149 $self->SUPER::_initialize(%arg);
150 delete $self->{'_ontologies'};
152 # ontology engine (and possibly name if it's an OntologyI)
153 $eng = Bio::Ontology::OBOEngine->new() unless $eng;
154 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
155 $self->ontology_name( $eng->name() );
156 $eng = $eng->engine() if $eng->can('engine');
158 $self->_ont_engine($eng);
160 $self->ontology_name($name) if $name;
162 } # _initialize
164 =head2 ontology_name
166 Title : ontology_name
167 Usage : $obj->ontology_name($newval)
168 Function: Get/set the name of the ontology parsed by this module.
169 Example :
170 Returns : value of ontology_name (a scalar)
171 Args : on set, new value (a scalar or undef, optional)
174 =cut
176 sub ontology_name {
177 my $self = shift;
179 return $self->{'ontology_name'} = shift if @_;
180 return $self->{'ontology_name'};
183 =head2 parse
185 Title : parse()
186 Usage : $parser->parse();
187 Function: Parses the files set with "new" or with methods
188 defs_file and _flat_files.
190 Normally you should not need to call this method as it will
191 be called automatically upon the first call to
192 next_ontology().
194 Returns : Bio::Ontology::OntologyEngineI
195 Args :
197 =cut
199 sub parse {
200 my $self = shift;
202 # setup the default term factory if not done by anyone yet
203 $self->term_factory(
204 Bio::Ontology::TermFactory->new( -type => "Bio::Ontology::OBOterm" ) )
205 unless $self->term_factory();
207 ## Parse the file header
208 my $annotations_collection = $self->_header();
210 # create the default ontology object itself
211 my $ont = Bio::Ontology::Ontology->new(
212 -name => $self->ontology_name(),
213 -engine => $self->_ont_engine()
216 ## Assign the file headers
217 $ont->annotation($annotations_collection);
219 # set up the ontology of the relationship types
220 foreach (
221 $self->_part_of_relationship(),
222 $self->_is_a_relationship(),
223 $self->_related_to_relationship(),
224 $self->_regulates_relationship(),
225 $self->_positively_regulates_relationship(),
226 $self->_negatively_regulates_relationship(),
229 $_->ontology($ont);
232 ##################################
233 $self->_add_ontology($ont);
234 ##################################
236 ### Adding new terms
237 while ( my $term = $self->_next_term() ) {
239 ### CHeck if the terms has a valid ID and NAME otherwise ignore the term
240 if ( !$term->identifier() || !$term->name() ) {
241 $self->throw( "OBO File Format Error on line "
242 . $self->{'_current_line_no'}
243 . " \nThe term does not have a id/name tag. This term will be ignored.\n"
245 next;
248 #print $term->identifier(),"\t",$term->name(),"\n";
250 my $new_ontology_flag = 1;
251 my $ontologies_array_ref = $self->{'_ontologies'};
252 foreach my $ontology (@$ontologies_array_ref) {
253 my ($oname, $t_ns) = ($ontology->name(), $term->namespace() );
254 next unless (defined($oname) && defined($t_ns));
255 if ( $oname eq $t_ns ) {
256 ### No need to create new ontology
257 $new_ontology_flag = 0;
258 $ont = $ontology;
262 if ( $new_ontology_flag && $term->namespace() ) {
263 my $new_ont = Bio::Ontology::Ontology->new(
264 -name => $term->namespace(),
265 -engine => $self->_ont_engine()
267 $new_ont->annotation($annotations_collection);
268 $self->_add_ontology($new_ont);
269 $ont = $new_ont;
273 $self->_add_term( $term, $ont );
275 #### Addding the IS_A relationship
276 my $isa_parents_array_ref = $self->{'_isa_parents'};
277 foreach my $parent_term (@$isa_parents_array_ref) {
278 ### Check if parent exist, if not then add the term to the graph.
279 if ( !( $self->_has_term($parent_term) ) ) {
280 $self->_add_term( $parent_term, $ont );
283 $self->_add_relationship( $parent_term, $term,
284 $self->_is_a_relationship(), $ont );
287 #### Addding the other relationships like part_of, realted_to, develpos_from
288 my $relationship_hash_ref = $self->{'_relationships'};
289 foreach my $relationship ( keys %$relationship_hash_ref ) {
290 my $reltype;
291 #### Check if relationship exist, if not add it.
292 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
293 $reltype =
294 $self->_ont_engine->get_relationship_type($relationship);
296 else {
297 $self->_ont_engine->add_relationship_type( $relationship,
298 $ont );
299 $reltype =
300 $self->_ont_engine->get_relationship_type($relationship);
303 #### Check if the id already exist in the graph
304 my $id_array_ref = $$relationship_hash_ref{$relationship};
305 foreach my $id (@$id_array_ref) {
306 my $parent_term = $self->_create_term_object();
307 $parent_term->identifier($id);
308 $parent_term->ontology($ont);
310 if ( !( $self->_has_term($parent_term) ) ) {
311 $self->_add_term( $parent_term, $ont );
314 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
320 return $self->_ont_engine();
321 } # parse
323 =head2 next_ontology
325 Title : next_ontology
326 Usage :
327 Function: Get the next available ontology from the parser. This is the
328 method prescribed by Bio::OntologyIO.
329 Example :
330 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
331 there is no more ontology in the input.
332 Args :
335 =cut
337 sub next_ontology {
338 my $self = shift;
340 # parse if not done already
341 $self->parse() unless exists( $self->{'_ontologies'} );
343 # return next available ontology
344 if ( exists( $self->{'_ontologies'} ) ) {
345 my $ont = shift( @{ $self->{'_ontologies'} } );
346 if ($ont) {
347 my $store = Bio::Ontology::OntologyStore->new();
348 $store->register_ontology($ont);
350 return $ont;
353 return;
356 =head2 close
358 Title : close
359 Usage :
360 Function: Closes this ontology stream and associated file handles.
362 Clients should call this method especially when they write
363 ontologies.
365 We need to override this here in order to close the file
366 handle for the term definitions file.
368 Example :
369 Returns : none
370 Args : none
373 =cut
375 sub close {
376 my $self = shift;
378 # first call the inherited implementation
379 $self->SUPER::close();
382 # INTERNAL METHODS
383 # ----------------
385 sub _add_ontology {
386 my $self = shift;
387 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
388 foreach my $ont (@_) {
389 $self->throw(
390 ref($ont) . " does not implement Bio::Ontology::OntologyI" )
391 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
393 # the ontology name may have been auto-discovered while parsing
394 # the file
395 $ont->name( $self->ontology_name ) unless $ont->name();
396 push( @{ $self->{'_ontologies'} }, $ont );
400 # This simply delegates. See OBOEngine.
401 sub _add_term {
402 my ( $self, $term, $ont ) = @_;
403 $term->ontology($ont) if $ont && ( !$term->ontology );
404 $self->_ont_engine()->add_term($term);
405 } # _add_term
407 # This simply delegates. See OBOEngine
408 sub _part_of_relationship {
409 my $self = shift;
411 return $self->_ont_engine()->part_of_relationship(@_);
412 } # _part_of_relationship
414 # This simply delegates. See OBOEngine
415 sub _is_a_relationship {
416 my $self = shift;
418 return $self->_ont_engine()->is_a_relationship(@_);
419 } # _is_a_relationship
421 # This simply delegates. See OBOEngine
422 sub _related_to_relationship {
423 my $self = shift;
425 return $self->_ont_engine()->related_to_relationship(@_);
426 } # _is_a_relationship
429 # This simply delegates. See OBOEngine
430 sub _regulates_relationship {
431 my $self = shift;
433 return $self->_ont_engine()->regulates_relationship(@_);
434 } # _part_of_relationship
436 # This simply delegates. See OBOEngine
437 sub _positively_regulates_relationship {
438 my $self = shift;
440 return $self->_ont_engine()->positively_regulates_relationship(@_);
441 } # _part_of_relationship
444 # This simply delegates. See OBOEngine
445 sub _negatively_regulates_relationship {
446 my $self = shift;
448 return $self->_ont_engine()->negatively_regulates_relationship(@_);
449 } # _part_of_relationship
451 # This simply delegates. See OBOEngine
452 sub _add_relationship {
453 my ( $self, $parent, $child, $type, $ont ) = @_;
455 # note the triple terminology (subject,predicate,object) corresponds to
456 # (child,type,parent)
457 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
459 } # _add_relationship
461 # This simply delegates. See OBOEngine
462 sub _has_term {
463 my $self = shift;
465 return $self->_ont_engine()->has_term(@_);
466 } # _add_term
468 # Holds the OBO engine to be parsed into
469 sub _ont_engine {
470 my ( $self, $value ) = @_;
472 if ( defined $value ) {
473 $self->{"_ont_engine"} = $value;
476 return $self->{"_ont_engine"};
477 } # _ont_engine
479 # Removes the escape chracters from the file
480 sub _filter_line {
481 my ( $self, $line ) = @_;
483 chomp($line);
484 $line =~ tr [\200-\377]
485 [\000-\177]; # see 'man perlop', section on tr/
486 # weird ascii characters should be excluded
487 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
488 # preserve \11 (9 - tab) and \12 (10-linefeed)
489 $line =~ tr/\13\14//d; # remove weird characters; 11,12
490 # preserve \15 (13 - carriage return)
491 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
492 $line =~ tr/\177//d; # remove DEL character
494 $line =~ s/^\!.*//;
495 $line =~ s/[^\\]\!.*//;
496 $line =~ s/[^\\]\#.*//;
497 $line =~ s/^\s+//;
498 $line =~ s/\s+$//;
500 return $line;
503 # Parses the header
504 sub _header {
505 my $self = shift;
506 my $annotation_collection = Bio::Annotation::Collection->new();
507 my ( $tag, $value );
508 my $line_counter = 0;
509 $self->{'_current_line_no'} = 0;
510 my $format_version_header_flag = 0;
511 my $default_namespace_header_flag = 0;
513 while ( my $line = $self->_readline() ) {
514 ++$line_counter;
515 my $line = $self->_filter_line($line);
516 if ( !$line ) {
517 if ( !$format_version_header_flag || !$default_namespace_header_flag) {
518 $self->throw(
519 "OBO File Format Error - \nCannot find tag format-version and/ default-namespace . These are required header.\n"
523 $self->{'_current_line_no'} = $line_counter;
524 return $annotation_collection;
527 ### CHeck if there is a header
528 if($line =~ /\[\w*\]/) {
529 $self->throw(
530 "OBO File Format Error - \nCannot find tag format-version. Thi ia a required header.\n"
535 ### If the line is not null, check it contains atleasdt one colon
536 $self->_check_colon( $line, $line_counter );
538 ### Thsse ar the allowed headers. Any other headers will be ignored
539 if ( $line =~
540 /^(\[|format-version:|typeref:|version:|date:|saved-by:|auto-generated-by:|default-namespace:|remark:|subsetdef:)/
543 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
544 ( $tag, $value ) = ( $1, $2 );
547 if ( $tag =~ /format-version/) {
548 $format_version_header_flag = 1;
549 }elsif( $tag =~ /default-namespace/ ) {
550 $default_namespace_header_flag = 1;
553 my $header = Bio::Annotation::SimpleValue->new( -value => $value );
554 $annotation_collection->add_Annotation( $tag, $header );
556 #### Assign the Ontology name as the value of the default-namespace header
557 if ( $tag =~ /default-namespace/i ) {
559 $self->ontology_name($value);
568 ### Parses each stanza of the file
569 sub _next_term {
570 my $self = shift;
571 my $term ;
572 my $skip_stanza_flag = 1;
573 my $line_counter = $self->{'_current_line_no'};
575 while ( my $line = $self->_readline() ) {
576 #print $line."\n";
577 ++$line_counter;
578 my $line = $self->_filter_line($line);
579 if ( !$line && $term ) {
580 $self->{'_current_line_no'} = $line_counter;
581 return $term;
584 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { #New stanza
586 if ( uc($1) eq "TERM" ) {
588 $term = $self->_create_term_object;
589 $skip_stanza_flag = 0;
590 ### Reset the relationships after each stanza
591 $self->{'_relationships'} = {};
592 $self->{'_isa_parents'} = undef;
594 elsif ( uc($1) eq "TYPEDEF" ) {
595 $skip_stanza_flag = 1;
596 ### Check if this typedef is already defined by the relationship
598 else {
599 $skip_stanza_flag = 1;
600 $self->warn(
601 "OBO File Format Warning on line $line_counter $line \nUnrecognized stanza type found. Skipping this stanza.\n"
604 next;
607 ### If the line is not null, check it contains atleasdt one colon
608 $self->_check_colon( $line, $line_counter );
610 ### if there is any tag value other thn the list below move to the next tag
611 next
612 if (
614 $line !~
615 /^(\[|id:|name:|is_a:|relationship:|namespace:|is_obsolete:|alt_id:|def:|xref_analog:|exact_synonym:|broad_synonym:|related_synonym:|synonym:|comment:|xref:)/
617 || $skip_stanza_flag
620 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) { #TAg Value pair
621 my ( $tag, $val ) = ( $1, $2 );
623 ### If no value for the tag thrown a warning
624 if ( !$val ) {
625 $self->warn(
626 "OBO File Format Warning on line $line_counter $line \nTag has no value\n"
630 my $qh;
631 ( $val, $qh ) = $self->_extract_quals($val);
632 my $val2 = $val;
633 $val2 =~ s/\\,/,/g;
634 $tag = uc($tag);
635 if ( $tag eq "ID" ) {
637 $term->identifier($val);
638 if ( $self->_has_term($term) ) {
639 $term = $self->_ont_engine()->get_terms($val);
643 elsif ( $tag eq "NAME" ) {
644 $term->name($val);
646 elsif ( $tag eq "XREF_ANALOG" ) {
647 if ( !$term->has_dbxref($val) ) {
648 $term->add_dbxref(-dbxrefs => $self->_to_annotation([$val]));
651 elsif ( $tag eq "XREF_UNKNOWN" ) {
652 $term->add_dbxref(-dbxrefs => $self->_to_annotation([$val]));
654 elsif ( $tag eq "NAMESPACE" ) {
655 $term->namespace($val);
657 elsif ( $tag eq "DEF" ) {
658 my ( $defstr, $parts ) = $self->_extract_qstr($val);
659 $term->definition($defstr);
660 my $ann = $self->_to_annotation($parts);
661 $term->add_dbxref(-dbxrefs => $ann);
663 elsif ( $tag =~ /(\w*)synonym/i ) {
664 #$val =~ s/['"\[\]]//g; #NML commented out b/c need quotes
665 $term->add_synonym($val);
667 elsif ( $tag eq "ALT_ID" ) {
668 $term->add_secondary_id($val);
670 elsif ( $tag =~ /XREF/i ) {
671 $term->add_secondary_id($val);
673 elsif ( $tag eq "IS_OBSOLETE" ) {
675 if ( $val eq 'true' ) {
676 $val = 1;
678 if ( $val eq 'false' ) {
679 $val = 0;
681 $term->is_obsolete($val);
683 elsif ( $tag eq "COMMENT" ) {
684 $term->comment($val);
686 elsif ( $tag eq "RELATIONSHIP" ) {
687 $self->_handle_relationship_tag($val);
689 elsif ( $tag eq "IS_A" ) {
691 $val =~ s/ //g;
692 my $parent_term = $self->_create_term_object();
693 $parent_term->identifier($val);
695 if ( $self->{'_isa_parents'} ) {
696 my $isa_parents_array_ref = $self->{'_isa_parents'};
697 push( @$isa_parents_array_ref, $parent_term );
699 else {
700 my @terms_array;
701 push( @terms_array, $parent_term );
702 $self->{'_isa_parents'} = \@terms_array;
707 return $term;
710 # Creates a Bio::Ontology::OBOterm object
711 sub _create_term_object {
713 my ($self) = @_;
714 my $term = $self->term_factory->create_object();
715 return $term;
720 sub _extract_quals {
721 my ( $self, $str ) = @_;
723 my %q = ();
724 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
725 my $return_str = $1;
726 my $extr = $2;
727 if ($extr) {
728 my @qparts = $self->_split_on_comma($extr);
729 foreach (@qparts) {
730 if (/(\w+)=\"(.*)\"/) {
731 $q{$1} = $2;
733 elsif (/(\w+)=\'(.*)\'/) {
734 $q{$1} = $2;
736 else {
737 warn("$_ in $str");
741 return ( $return_str, \%q );
743 else {
744 return ( $str, {} );
748 sub _extract_qstr {
749 my ( $self, $str ) = @_;
751 my ( $extr, $rem, $prefix ) = extract_quotelike($str);
752 my $txt = $extr;
753 $txt =~ s/^\"//;
754 $txt =~ s/\"$//;
755 if ($prefix) {
756 warn("illegal prefix: $prefix in: $str");
759 my @extra = ();
761 # eg synonym: "foo" EXACT [...]
762 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
763 $rem = $2;
764 push( @extra, split( ' ', $1 ) );
767 my @parts = ();
768 while ( ( $extr, $rem, $prefix ) = extract_bracketed( $rem, '[]' ) ) {
769 last unless $extr;
770 $extr =~ s/^\[//;
771 $extr =~ s/\]$//;
772 push( @parts, $extr ) if $extr;
774 @parts =
775 map { $self->_split_on_comma($_) } @parts;
777 $txt =~ s/\\//g;
778 return ( $txt, \@parts, \@extra );
781 sub _split_on_comma {
782 my ( $self, $str ) = @_;
783 my @parts = ();
784 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
785 $str = $1;
786 my $part = $2;
787 unshift( @parts, $part );
788 $str =~ s/,\s*$//;
790 unshift( @parts, $str );
791 return map { s/\\//g; $_ } @parts;
794 # This method checks for an existing colon in a line
795 sub _check_colon {
796 my ( $self, $line, $line_no ) = @_;
797 if ( $line && !( $line =~ /:/ ) ) {
798 $self->throw(
799 "OBO File Format Error on line $line_no $line - \nCannot find key-terminating colon\n"
804 # This method handles relationship tags
805 sub _handle_relationship_tag {
806 my ( $self, $val ) = @_;
807 my @parts = split( / /, $val );
808 my $relationship = uc($parts[0]);
809 my $id = $parts[1] =~ /\^(w+)\s+\!/ ? $1 : $parts[1];
810 my $parent_term = $self->_create_term_object();
811 $parent_term->identifier($id);
813 if ( my $realtionships_hash = $self->{'_relationships'} ) {
814 my $id_array_ref = $$realtionships_hash{$relationship};
815 if ( !$id_array_ref ) {
816 my @ids;
817 push( @ids, $id );
818 $$realtionships_hash{$relationship} = \@ids;
821 else {
822 push( @$id_array_ref, $id );
829 # convert simple strings to Bio::Annotation::DBLinks
830 sub _to_annotation {
831 my ($self , $links) = @_;
832 return unless $links;
833 my @dbxrefs;
834 for my $string (@{$links}) {
835 my ($db, $id) = split(':',$string);
836 push @dbxrefs, Bio::Annotation::DBLink->new(-database => $db, -primary_id => $id);
838 return \@dbxrefs;