sync w/ main trunk
[bioperl-live.git] / Bio / OntologyIO / obo.pm
blobc271ee00d2694b788878985fbffad0c63f2fe000
1 # $Id$
3 # BioPerl module for Bio::OntologyIO::obo
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sohel Merchant, s-merchant at northwestern.edu
9 # Copyright Sohel Merchant
11 # You may distribute this module under the same terms as perl itself
14 =head1 NAME
16 Bio::OntologyIO::obo - a parser for OBO flat-file format from Gene Ontology Consortium
18 =head1 SYNOPSIS
20 use Bio::OntologyIO;
22 # do not use directly -- use via Bio::OntologyIO
23 my $parser = Bio::OntologyIO->new
24 ( -format => "obo",
25 -file => "gene_ontology.obo");
27 while(my $ont = $parser->next_ontology()) {
28 print "read ontology ",$ont->name()," with ",
29 scalar($ont->get_root_terms)," root terms, and ",
30 scalar($ont->get_all_terms)," total terms, and ",
31 scalar($ont->get_leaf_terms)," leaf terms\n";
35 =head1 DESCRIPTION
37 Needs Graph.pm from CPAN.
39 =head1 FEEDBACK
41 =head2 Mailing Lists
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to the
45 Bioperl mailing lists Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50 =head2 Support
52 Please direct usage questions or support issues to the mailing list:
54 L<bioperl-l@bioperl.org>
56 rather than to the module maintainer directly. Many experienced and
57 reponsive experts will be able look at the problem and quickly
58 address it. Please include a thorough description of the problem
59 with code and data examples if at all possible.
61 =head2 Reporting Bugs
63 Report bugs to the Bioperl bug tracking system to help us keep track
64 the bugs and their resolution. Bug reports can be submitted via the
65 web:
67 http://bugzilla.open-bio.org/
69 =head1 AUTHOR
71 Sohel Merchant
73 Email: s-merchant@northwestern.edu
76 Address:
78 Northwestern University
79 Center for Genetic Medicine (CGM), dictyBase
80 Suite 1206,
81 676 St. Clair st
82 Chicago IL 60611
84 =head2 CONTRIBUTOR
86 Hilmar Lapp, hlapp at gmx.net
87 Chris Mungall, cjm at fruitfly.org
89 =head1 APPENDIX
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
94 =cut
96 package Bio::OntologyIO::obo;
98 use strict;
100 use Bio::Root::IO;
101 use Bio::Ontology::OBOEngine;
102 use Bio::Ontology::Ontology;
103 use Bio::Ontology::OntologyStore;
104 use Bio::Ontology::TermFactory;
105 use Bio::Annotation::Collection;
106 use Text::Balanced qw(extract_quotelike extract_bracketed);
108 use constant TRUE => 1;
109 use constant FALSE => 0;
111 use base qw(Bio::OntologyIO);
113 =head2 new
115 Title : new
116 Usage : $parser = Bio::OntologyIO->new(
117 -format => "obo",
118 -file => "gene_ontology.obo");
119 Function: Creates a new dagflat parser.
120 Returns : A new dagflat parser object, implementing Bio::OntologyIO.
121 Args : -file => a single ontology flat file holding the
122 terms, descriptions and relationships
123 -ontology_name => the name of the ontology; if not specified the
124 parser will assign the name of the ontology as the
125 default-namespace header value from the OBO file.
126 -engine => the Bio::Ontology::OntologyEngineI object
127 to be reused (will be created otherwise); note
128 that every Bio::Ontology::OntologyI will
129 qualify as well since that one inherits from the
130 former.
132 See L<Bio::OntologyIO>.
134 =cut
136 # in reality, we let OntologyIO::new do the instantiation, and override
137 # _initialize for all initialization work
138 sub _initialize {
139 my ( $self, %arg ) = @_;
141 my ( $file, $name, $eng ) = $self->_rearrange(
143 qw( FILE
144 ONTOLOGY_NAME
145 ENGINE)
147 %arg
150 $self->SUPER::_initialize(%arg);
151 delete $self->{'_ontologies'};
153 # ontology engine (and possibly name if it's an OntologyI)
154 $eng = Bio::Ontology::OBOEngine->new() unless $eng;
155 if ( $eng->isa("Bio::Ontology::OntologyI") ) {
156 $self->ontology_name( $eng->name() );
157 $eng = $eng->engine() if $eng->can('engine');
159 $self->_ont_engine($eng);
161 $self->ontology_name($name) if $name;
163 } # _initialize
165 =head2 ontology_name
167 Title : ontology_name
168 Usage : $obj->ontology_name($newval)
169 Function: Get/set the name of the ontology parsed by this module.
170 Example :
171 Returns : value of ontology_name (a scalar)
172 Args : on set, new value (a scalar or undef, optional)
175 =cut
177 sub ontology_name {
178 my $self = shift;
180 return $self->{'ontology_name'} = shift if @_;
181 return $self->{'ontology_name'};
184 =head2 parse
186 Title : parse()
187 Usage : $parser->parse();
188 Function: Parses the files set with "new" or with methods
189 defs_file and _flat_files.
191 Normally you should not need to call this method as it will
192 be called automatically upon the first call to
193 next_ontology().
195 Returns : Bio::Ontology::OntologyEngineI
196 Args :
198 =cut
200 sub parse {
201 my $self = shift;
203 # setup the default term factory if not done by anyone yet
204 $self->term_factory(
205 Bio::Ontology::TermFactory->new( -type => "Bio::Ontology::OBOterm" ) )
206 unless $self->term_factory();
208 ## Parse the file header
209 my $annotations_collection = $self->_header();
211 # create the default ontology object itself
212 my $ont = Bio::Ontology::Ontology->new(
213 -name => $self->ontology_name(),
214 -engine => $self->_ont_engine()
217 ## Assign the file headers
218 $ont->annotation($annotations_collection);
220 # set up the ontology of the relationship types
221 foreach (
222 $self->_part_of_relationship(),
223 $self->_is_a_relationship(),
224 $self->_related_to_relationship(),
225 $self->_regulates_relationship(),
226 $self->_positively_regulates_relationship(),
227 $self->_negatively_regulates_relationship(),
230 $_->ontology($ont);
233 ##################################
234 $self->_add_ontology($ont);
235 ##################################
237 ### Adding new terms
238 while ( my $term = $self->_next_term() ) {
240 ### CHeck if the terms has a valid ID and NAME otherwise ignore the term
241 if ( !$term->identifier() || !$term->name() ) {
242 $self->throw( "OBO File Format Error on line "
243 . $self->{'_current_line_no'}
244 . " \nThe term does not have a id/name tag. This term will be ignored.\n"
246 next;
249 #print $term->identifier(),"\t",$term->name(),"\n";
251 my $new_ontology_flag = 1;
252 my $ontologies_array_ref = $self->{'_ontologies'};
253 foreach my $ontology (@$ontologies_array_ref) {
254 my ($oname, $t_ns) = ($ontology->name(), $term->namespace() );
255 next unless (defined($oname) && defined($t_ns));
256 if ( $oname eq $t_ns ) {
257 ### No need to create new ontology
258 $new_ontology_flag = 0;
259 $ont = $ontology;
263 if ( $new_ontology_flag && $term->namespace() ) {
264 my $new_ont = Bio::Ontology::Ontology->new(
265 -name => $term->namespace(),
266 -engine => $self->_ont_engine()
268 $new_ont->annotation($annotations_collection);
269 $self->_add_ontology($new_ont);
270 $ont = $new_ont;
274 $self->_add_term( $term, $ont );
276 #### Addding the IS_A relationship
277 my $isa_parents_array_ref = $self->{'_isa_parents'};
278 foreach my $parent_term (@$isa_parents_array_ref) {
279 ### Check if parent exist, if not then add the term to the graph.
280 if ( !( $self->_has_term($parent_term) ) ) {
281 $self->_add_term( $parent_term, $ont );
284 $self->_add_relationship( $parent_term, $term,
285 $self->_is_a_relationship(), $ont );
288 #### Addding the other relationships like part_of, realted_to, develpos_from
289 my $relationship_hash_ref = $self->{'_relationships'};
290 foreach my $relationship ( keys %$relationship_hash_ref ) {
291 my $reltype;
292 #### Check if relationship exist, if not add it.
293 if ( $self->_ont_engine->get_relationship_type($relationship) ) {
294 $reltype =
295 $self->_ont_engine->get_relationship_type($relationship);
297 else {
298 $self->_ont_engine->add_relationship_type( $relationship,
299 $ont );
300 $reltype =
301 $self->_ont_engine->get_relationship_type($relationship);
304 #### Check if the id already exist in the graph
305 my $id_array_ref = $$relationship_hash_ref{$relationship};
306 foreach my $id (@$id_array_ref) {
307 my $parent_term = $self->_create_term_object();
308 $parent_term->identifier($id);
309 $parent_term->ontology($ont);
311 if ( !( $self->_has_term($parent_term) ) ) {
312 $self->_add_term( $parent_term, $ont );
315 $self->_add_relationship( $parent_term, $term, $reltype, $ont );
321 return $self->_ont_engine();
322 } # parse
324 =head2 next_ontology
326 Title : next_ontology
327 Usage :
328 Function: Get the next available ontology from the parser. This is the
329 method prescribed by Bio::OntologyIO.
330 Example :
331 Returns : An object implementing Bio::Ontology::OntologyI, and nothing if
332 there is no more ontology in the input.
333 Args :
336 =cut
338 sub next_ontology {
339 my $self = shift;
341 # parse if not done already
342 $self->parse() unless exists( $self->{'_ontologies'} );
344 # return next available ontology
345 if ( exists( $self->{'_ontologies'} ) ) {
346 my $ont = shift( @{ $self->{'_ontologies'} } );
347 if ($ont) {
348 my $store = Bio::Ontology::OntologyStore->new();
349 $store->register_ontology($ont);
351 return $ont;
354 return;
357 =head2 close
359 Title : close
360 Usage :
361 Function: Closes this ontology stream and associated file handles.
363 Clients should call this method especially when they write
364 ontologies.
366 We need to override this here in order to close the file
367 handle for the term definitions file.
369 Example :
370 Returns : none
371 Args : none
374 =cut
376 sub close {
377 my $self = shift;
379 # first call the inherited implementation
380 $self->SUPER::close();
383 # INTERNAL METHODS
384 # ----------------
386 sub _add_ontology {
387 my $self = shift;
388 $self->{'_ontologies'} = [] unless exists( $self->{'_ontologies'} );
389 foreach my $ont (@_) {
390 $self->throw(
391 ref($ont) . " does not implement Bio::Ontology::OntologyI" )
392 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
394 # the ontology name may have been auto-discovered while parsing
395 # the file
396 $ont->name( $self->ontology_name ) unless $ont->name();
397 push( @{ $self->{'_ontologies'} }, $ont );
401 # This simply delegates. See OBOEngine.
402 sub _add_term {
403 my ( $self, $term, $ont ) = @_;
404 $term->ontology($ont) if $ont && ( !$term->ontology );
405 $self->_ont_engine()->add_term($term);
406 } # _add_term
408 # This simply delegates. See OBOEngine
409 sub _part_of_relationship {
410 my $self = shift;
412 return $self->_ont_engine()->part_of_relationship(@_);
413 } # _part_of_relationship
415 # This simply delegates. See OBOEngine
416 sub _is_a_relationship {
417 my $self = shift;
419 return $self->_ont_engine()->is_a_relationship(@_);
420 } # _is_a_relationship
422 # This simply delegates. See OBOEngine
423 sub _related_to_relationship {
424 my $self = shift;
426 return $self->_ont_engine()->related_to_relationship(@_);
427 } # _is_a_relationship
430 # This simply delegates. See OBOEngine
431 sub _regulates_relationship {
432 my $self = shift;
434 return $self->_ont_engine()->regulates_relationship(@_);
435 } # _part_of_relationship
437 # This simply delegates. See OBOEngine
438 sub _positively_regulates_relationship {
439 my $self = shift;
441 return $self->_ont_engine()->positively_regulates_relationship(@_);
442 } # _part_of_relationship
445 # This simply delegates. See OBOEngine
446 sub _negatively_regulates_relationship {
447 my $self = shift;
449 return $self->_ont_engine()->negatively_regulates_relationship(@_);
450 } # _part_of_relationship
452 # This simply delegates. See OBOEngine
453 sub _add_relationship {
454 my ( $self, $parent, $child, $type, $ont ) = @_;
456 # note the triple terminology (subject,predicate,object) corresponds to
457 # (child,type,parent)
458 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
460 } # _add_relationship
462 # This simply delegates. See OBOEngine
463 sub _has_term {
464 my $self = shift;
466 return $self->_ont_engine()->has_term(@_);
467 } # _add_term
469 # Holds the OBO engine to be parsed into
470 sub _ont_engine {
471 my ( $self, $value ) = @_;
473 if ( defined $value ) {
474 $self->{"_ont_engine"} = $value;
477 return $self->{"_ont_engine"};
478 } # _ont_engine
480 # Removes the escape chracters from the file
481 sub _filter_line {
482 my ( $self, $line ) = @_;
484 chomp($line);
485 $line =~ tr [\200-\377]
486 [\000-\177]; # see 'man perlop', section on tr/
487 # weird ascii characters should be excluded
488 $line =~ tr/\0-\10//d; # remove weird characters; ascii 0-8
489 # preserve \11 (9 - tab) and \12 (10-linefeed)
490 $line =~ tr/\13\14//d; # remove weird characters; 11,12
491 # preserve \15 (13 - carriage return)
492 $line =~ tr/\16-\37//d; # remove 14-31 (all rest before space)
493 $line =~ tr/\177//d; # remove DEL character
495 $line =~ s/^\!.*//;
496 $line =~ s/[^\\]\!.*//;
497 $line =~ s/[^\\]\#.*//;
498 $line =~ s/^\s+//;
499 $line =~ s/\s+$//;
501 return $line;
504 # Parses the header
505 sub _header {
506 my $self = shift;
507 my $annotation_collection = Bio::Annotation::Collection->new();
508 my ( $tag, $value );
509 my $line_counter = 0;
510 $self->{'_current_line_no'} = 0;
511 my $format_version_header_flag = 0;
512 my $default_namespace_header_flag = 0;
514 while ( my $line = $self->_readline() ) {
515 ++$line_counter;
516 my $line = $self->_filter_line($line);
517 if ( !$line ) {
518 if ( !$format_version_header_flag || !$default_namespace_header_flag) {
519 $self->throw(
520 "OBO File Format Error - \nCannot find tag format-version and/ default-namespace . These are required header.\n"
524 $self->{'_current_line_no'} = $line_counter;
525 return $annotation_collection;
528 ### CHeck if there is a header
529 if($line =~ /\[\w*\]/) {
530 $self->throw(
531 "OBO File Format Error - \nCannot find tag format-version. Thi ia a required header.\n"
536 ### If the line is not null, check it contains atleasdt one colon
537 $self->_check_colon( $line, $line_counter );
539 ### Thsse ar the allowed headers. Any other headers will be ignored
540 if ( $line =~
541 /^(\[|format-version:|typeref:|version:|date:|saved-by:|auto-generated-by:|default-namespace:|remark:|subsetdef:)/
544 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) {
545 ( $tag, $value ) = ( $1, $2 );
548 if ( $tag =~ /format-version/) {
549 $format_version_header_flag = 1;
550 }elsif( $tag =~ /default-namespace/ ) {
551 $default_namespace_header_flag = 1;
554 my $header = Bio::Annotation::SimpleValue->new( -value => $value );
555 $annotation_collection->add_Annotation( $tag, $header );
557 #### Assign the Ontology name as the value of the default-namespace header
558 if ( $tag =~ /default-namespace/i ) {
560 $self->ontology_name($value);
569 ### Parses each stanza of the file
570 sub _next_term {
571 my $self = shift;
572 my $term ;
573 my $skip_stanza_flag = 1;
574 my $line_counter = $self->{'_current_line_no'};
576 while ( my $line = $self->_readline() ) {
577 #print $line."\n";
578 ++$line_counter;
579 my $line = $self->_filter_line($line);
580 if ( !$line && $term ) {
581 $self->{'_current_line_no'} = $line_counter;
582 return $term;
585 if ( ( $line =~ /^\[(\w+)\]\s*(.*)/ ) ) { #New stanza
587 if ( uc($1) eq "TERM" ) {
589 $term = $self->_create_term_object;
590 $skip_stanza_flag = 0;
591 ### Reset the relationships after each stanza
592 $self->{'_relationships'} = {};
593 $self->{'_isa_parents'} = undef;
595 elsif ( uc($1) eq "TYPEDEF" ) {
596 $skip_stanza_flag = 1;
597 ### Check if this typedef is already defined by the relationship
599 else {
600 $skip_stanza_flag = 1;
601 $self->warn(
602 "OBO File Format Warning on line $line_counter $line \nUnrecognized stanza type found. Skipping this stanza.\n"
605 next;
608 ### If the line is not null, check it contains atleasdt one colon
609 $self->_check_colon( $line, $line_counter );
611 ### if there is any tag value other thn the list below move to the next tag
612 next
613 if (
615 $line !~
616 /^(\[|id:|name:|is_a:|relationship:|namespace:|is_obsolete:|alt_id:|def:|xref_analog:|exact_synonym:|broad_synonym:|related_synonym:|synonym:|comment:)/
618 || $skip_stanza_flag
621 if ( $line =~ /^([\w\-]+)\:\s*(.*)/ ) { #TAg Value pair
622 my ( $tag, $val ) = ( $1, $2 );
624 ### If no value for the tag thrown a warning
625 if ( !$val ) {
626 $self->warn(
627 "OBO File Format Warning on line $line_counter $line \nTag has no value\n"
631 my $qh;
632 ( $val, $qh ) = $self->_extract_quals($val);
633 my $val2 = $val;
634 $val2 =~ s/\\,/,/g;
635 $tag = uc($tag);
636 if ( $tag eq "ID" ) {
638 $term->identifier($val);
639 if ( $self->_has_term($term) ) {
640 $term = $self->_ont_engine()->get_terms($val);
644 elsif ( $tag eq "NAME" ) {
645 $term->name($val);
647 elsif ( $tag eq "XREF_ANALOG" ) {
648 if ( !$term->has_dbxref($val) ) {
649 $term->add_dbxref(-dbxrefs => $self->_to_annotation([$val]));
652 elsif ( $tag eq "XREF_UNKNOWN" ) {
653 $term->add_dbxref(-dbxrefs => $self->_to_annotation([$val]));
655 elsif ( $tag eq "NAMESPACE" ) {
656 $term->namespace($val);
658 elsif ( $tag eq "DEF" ) {
659 my ( $defstr, $parts ) = $self->_extract_qstr($val);
660 $term->definition($defstr);
661 my $ann = $self->_to_annotation($parts);
662 $term->add_dbxref(-dbxrefs => $ann);
664 elsif ( $tag =~ /(\w*)synonym/i ) {
665 $val =~ s/['"\[\]]//g;
666 $term->add_synonym($val);
668 elsif ( $tag eq "ALT_ID" ) {
669 $term->add_secondary_id($val);
671 elsif ( $tag eq "IS_OBSOLETE" ) {
673 if ( $val eq 'true' ) {
674 $val = 1;
676 if ( $val eq 'false' ) {
677 $val = 0;
679 $term->is_obsolete($val);
681 elsif ( $tag eq "COMMENT" ) {
682 $term->comment($val);
684 elsif ( $tag eq "RELATIONSHIP" ) {
685 $self->_handle_relationship_tag($val);
687 elsif ( $tag eq "IS_A" ) {
689 $val =~ s/ //g;
690 my $parent_term = $self->_create_term_object();
691 $parent_term->identifier($val);
693 if ( $self->{'_isa_parents'} ) {
694 my $isa_parents_array_ref = $self->{'_isa_parents'};
695 push( @$isa_parents_array_ref, $parent_term );
697 else {
698 my @terms_array;
699 push( @terms_array, $parent_term );
700 $self->{'_isa_parents'} = \@terms_array;
705 return $term;
708 # Creates a Bio::Ontology::OBOterm object
709 sub _create_term_object {
711 my ($self) = @_;
712 my $term = $self->term_factory->create_object();
713 return $term;
718 sub _extract_quals {
719 my ( $self, $str ) = @_;
721 my %q = ();
722 if ( $str =~ /(.*)\s+(\{.*\})\s*$/ ) {
723 my $return_str = $1;
724 my $extr = $2;
725 if ($extr) {
726 my @qparts = $self->_split_on_comma($extr);
727 foreach (@qparts) {
728 if (/(\w+)=\"(.*)\"/) {
729 $q{$1} = $2;
731 elsif (/(\w+)=\'(.*)\'/) {
732 $q{$1} = $2;
734 else {
735 warn("$_ in $str");
739 return ( $return_str, \%q );
741 else {
742 return ( $str, {} );
746 sub _extract_qstr {
747 my ( $self, $str ) = @_;
749 my ( $extr, $rem, $prefix ) = extract_quotelike($str);
750 my $txt = $extr;
751 $txt =~ s/^\"//;
752 $txt =~ s/\"$//;
753 if ($prefix) {
754 warn("illegal prefix: $prefix in: $str");
757 my @extra = ();
759 # eg synonym: "foo" EXACT [...]
760 if ( $rem =~ /(\w+)\s+(\[.*)/ ) {
761 $rem = $2;
762 push( @extra, split( ' ', $1 ) );
765 my @parts = ();
766 while ( ( $extr, $rem, $prefix ) = extract_bracketed( $rem, '[]' ) ) {
767 last unless $extr;
768 $extr =~ s/^\[//;
769 $extr =~ s/\]$//;
770 push( @parts, $extr ) if $extr;
772 @parts =
773 map { $self->_split_on_comma($_) } @parts;
775 $txt =~ s/\\//g;
776 return ( $txt, \@parts, \@extra );
779 sub _split_on_comma {
780 my ( $self, $str ) = @_;
781 my @parts = ();
782 while ( $str =~ /(.*[^\\],\s*)(.*)/ ) {
783 $str = $1;
784 my $part = $2;
785 unshift( @parts, $part );
786 $str =~ s/,\s*$//;
788 unshift( @parts, $str );
789 return map { s/\\//g; $_ } @parts;
792 # This method checks for an existing colon in a line
793 sub _check_colon {
794 my ( $self, $line, $line_no ) = @_;
795 if ( $line && !( $line =~ /:/ ) ) {
796 $self->throw(
797 "OBO File Format Error on line $line_no $line - \nCannot find key-terminating colon\n"
802 # This method handles relationship tags
803 sub _handle_relationship_tag {
804 my ( $self, $val ) = @_;
805 my @parts = split( / /, $val );
806 my $relationship = uc($parts[0]);
807 my $id = $parts[1] =~ /\^(w+)\s+\!/ ? $1 : $parts[1];
808 my $parent_term = $self->_create_term_object();
809 $parent_term->identifier($id);
811 if ( my $realtionships_hash = $self->{'_relationships'} ) {
812 my $id_array_ref = $$realtionships_hash{$relationship};
813 if ( !$id_array_ref ) {
814 my @ids;
815 push( @ids, $id );
816 $$realtionships_hash{$relationship} = \@ids;
819 else {
820 push( @$id_array_ref, $id );
827 # convert simple strings to Bio::Annotation::DBLinks
828 sub _to_annotation {
829 my ($self , $links) = @_;
830 return unless $links;
831 my @dbxrefs;
832 for my $string (@{$links}) {
833 my ($db, $id) = split(':',$string);
834 push @dbxrefs, Bio::Annotation::DBLink->new(-database => $db, -primary_id => $id);
836 return \@dbxrefs;