sync with trunk to r15684
[bioperl-live.git] / Bio / SeqIO / entrezgene.pm
blobe8fdc254b7f61af34cb56a3617d5d8a9b26aa828
1 # $Id$
2 # BioPerl module for Bio::SeqIO::entrezgene
4 # You may distribute this module under the same terms as perl itself
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::SeqIO::entrezgene - Entrez Gene ASN1 parser
12 =head1 SYNOPSIS
14 use Bio::SeqIO;
16 # don't instantiate directly - instead do
17 my $seqio = Bio::SeqIO->new(-format => 'entrezgene',
18 -file => $file);
19 my $gene = $seqio->next_seq;
21 =head1 DESCRIPTION
23 This is EntrezGene ASN bioperl parser. It is built on top of
24 L<Bio::ASN1::EntrezGene>, a low level ASN parser built by Mingyi Liu
25 (L<http://sourceforge.net/projects/egparser>). The easiest way to
26 use it is shown above.
28 You will get most of the Entrez Gene annotation such as gene symbol,
29 gene name and description, accession numbers associated
30 with the gene, etc. Almost all of these are given as L<Bio::AnnotationI> objects.
32 If you need all the data do:
34 my $seqio = Bio::SeqIO->new(-format => 'entrezgene',
35 -file => $file,
36 -debug => 'on' );
37 my ($gene,$genestructure,$uncaptured) = $seqio->next_seq;
39 The second variable returned, C<$genestructure>, is a L<Bio::Cluster::SequenceFamily>
40 object. It contains all Refseqs and the genomic contigs that are associated
41 with the particular gene. The third variable, C<$uncaptured>, is a reference
42 to a plain array.
44 You can also modify the output to allow back compatibility with the old
45 LocusLink parser:
47 my $seqio = Bio::SeqIO->new(-format => 'entrezgene',
48 -file => $file,
49 -locuslink => 'convert');
51 The C<-debug> and C<-locuslink> options slow down the parser.
53 Example code which looks for ontology terms:
55 my $eio = new Bio::SeqIO(-file => $file,
56 -format => 'entrezgene',
57 -service_record => 'yes');
59 while (my $seq = $eio->next_seq) {
60 my $gid = $seq->accession_number;
61 foreach my $ot ($ann->get_Annotations('OntologyTerm')) {
62 next if ($ot->term->authority eq 'STS marker'); # No STS markers
63 my $evid = $ot->comment;
64 $evid =~ s/evidence: //i;
65 my @ref = $ot->term->get_references;
66 my $id = $ot->identifier;
67 my $fid = 'GO:' . sprintf("%07u",$id);
68 print join("\t",$gid, $ot->ontology->name, $ot->name, $evid,
69 $fid, @ref?$ref[0]->medline:''), "\n";
73 =head1 FEEDBACK
75 =head2 Mailing Lists
77 User feedback is an integral part of the evolution of this and other
78 Bioperl modules. Send your comments and suggestions preferably to
79 the Bioperl mailing list. Your participation is much appreciated.
81 bioperl-l@bioperl.org - General discussion
82 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
84 =head2 Support
86 Please direct usage questions or support issues to the mailing list:
88 L<bioperl-l@bioperl.org>
90 rather than to the module maintainer directly. Many experienced and
91 reponsive experts will be able look at the problem and quickly
92 address it. Please include a thorough description of the problem
93 with code and data examples if at all possible.
95 =head2 Reporting Bugs
97 Report bugs to the Bioperl bug tracking system to help us keep track
98 of the bugs and their resolution. Bug reports can be submitted via
99 the web:
101 http://bugzilla.open-bio.org/
103 =head1 AUTHOR - Stefan Kirov
105 Email skirov at utk.edu
107 =head1 CONTRIBUTORS
109 Hilmar Lapp, hlapp at gmx.net
111 =head1 APPENDIX
113 This parser is based on Bio::ASN1::EntrezGene module.
115 The rest of the documentation details each of the object methods.
116 Internal methods are usually preceded with a _
118 =cut
120 package Bio::SeqIO::entrezgene;
122 use strict;
123 use Bio::ASN1::EntrezGene;
124 use Bio::Seq;
125 use Bio::Species;
126 use Bio::Annotation::SimpleValue;
127 use Bio::Annotation::DBLink;
128 use Bio::Annotation::Comment;
129 use Bio::SeqFeature::Generic;
130 use Bio::Annotation::Reference;
131 use Bio::SeqFeature::Gene::Exon;
132 use Bio::SeqFeature::Gene::Transcript;
133 use Bio::SeqFeature::Gene::GeneStructure;
134 use Bio::Cluster::SequenceFamily;
136 #use Bio::Ontology::Ontology; Relationships.... later
137 use Bio::Ontology::Term;
138 use Bio::Annotation::OntologyTerm;
139 use Data::Dumper;
140 use base qw(Bio::SeqIO);
142 %main::eg_to_ll = (
143 'Official Full Name' => 'OFFICIAL_GENE_NAME',
144 'chromosome' => 'CHR',
145 'cyto' => 'MAP',
146 'Official Symbol' => 'OFFICIAL_SYMBOL'
148 @main::egonly = keys %main::eg_to_ll;
150 # We define $xval and some other variables so we don't have
151 # to pass them as arguments
152 my ( $seq, $ann, $xval, %seqcollection, $buf );
154 sub _initialize {
155 my ( $self, @args ) = @_;
156 $self->SUPER::_initialize(@args);
157 my %param = @args;
158 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
159 $self->{_debug} = $param{-debug} || 'off';
160 $self->{_locuslink} = $param{-locuslink} || 'no';
161 $self->{_service_record} = $param{-service_record} || 'no';
162 $self->{_parser} = Bio::ASN1::EntrezGene->new( file => $param{-file} );
164 #Instantiate the low level parser here (it is -file in Bioperl
165 #-should tell M.)
166 #$self->{_parser}->next_seq; #First empty record- bug in Bio::ASN::Parser
169 sub next_seq {
170 my $self = shift;
171 my $value = $self->{_parser}->next_seq(1);
173 # $value contains data structure for the
174 # record being parsed. 2 indicates the recommended
175 # trimming mode of the data structure
176 #I use 1 as I prefer not to descend into size 0 arrays
177 return unless ($value);
178 my $debug = $self->{_debug};
179 $self->{_ann} = Bio::Annotation::Collection->new();
180 $self->{_currentann} = Bio::Annotation::Collection->new();
181 my @alluncaptured;
183 # parse the entry
184 #my @keys=keys %{$value}; obsolete
185 $xval = $value->[0];
187 #return unless ($xval->{gene}->{desc} eq 'albumin');
188 #return new Bio::Seq (-id=>'Generif service record', -seq=>'')
189 # unless ($xval->{'track-info'}{geneid}== 283);
190 return Bio::Seq->new( -id => 'Generif service record', -seq => '' )
191 if (
192 ( $self->{_service_record} ne 'yes' )
193 && ( $xval->{gene}->{desc} =~
194 /record to support submission of generifs for a gene not in entrez/i
198 #Basic data
199 #$xval->{summary}=~s/\n//g;
200 my $seq = Bio::Seq->new(
201 -display_id => $xval->{gene}{locus},
202 -accession_number => $xval->{'track-info'}{geneid},
203 -desc => $xval->{summary}
206 #Source data here
207 $self->_add_to_ann( $xval->{'track-info'}->{status}, 'Entrez Gene Status' );
208 my $lineage = $xval->{source}{org}{orgname}{lineage};
209 $lineage =~ s/[\s\n]//g;
210 my ( $comp, @lineage );
211 while ($lineage) {
212 ( $comp, $lineage ) = split( /;/, $lineage, 2 );
213 unshift @lineage, $comp;
215 unless ( exists( $xval->{source}{org}{orgname}{name}{binomial} ) ) {
216 shift @lineage;
217 my ( $gen, $sp ) = split( /\s/, $xval->{source}{org}{taxname} );
218 if ( ($sp) && ( $sp ne '' ) ) {
219 if ( $gen =~ /plasmid/i ) {
220 $sp = $gen . $sp;
222 unshift @lineage, $sp;
224 else {
225 unshift @lineage, 'unknown';
228 else {
229 my $sp = $xval->{source}{org}{orgname}{name}{binomial}{species};
230 if ( ($sp) && ( $sp ne '' ) ) {
231 my ( $spc, $strain ) = split( 'sp.', $sp ); #Do we need strain?
232 $spc =~ s/\s//g;
233 if ( ($spc) && ( $spc ne '' ) ) {
234 unshift @lineage, $spc;
236 else {
237 unshift @lineage, 'unknown';
240 else {
241 unshift @lineage, 'unknown';
245 #print Dumper($xval->{source}{org});
246 my $ncbiid;
247 if ( ref( $xval->{source}{org}{db} ) eq 'ARRAY' ) {
248 foreach my $taxonomy ( @{ $xval->{source}{org}{db} } ) {
249 if ( lc( $taxonomy->{db} ) eq 'taxon' ) {
250 $ncbiid = $taxonomy->{tag}{id};
252 else {
253 push @alluncaptured, $taxonomy;
255 delete $xval->{source}{org}{db};
258 $ncbiid = $ncbiid || $xval->{source}{org}{db}{tag}{id};
259 my $s1 = shift @lineage;
260 my $s2 = shift @lineage;
261 my $specie = Bio::Species->new(
262 -classification => [ $s1, $s2 ],
263 -ncbi_taxid => $ncbiid
265 $specie->common_name( $xval->{source}{org}{common} );
266 if ( exists( $xval->{source}->{subtype} )
267 && ( $xval->{source}->{subtype} ) )
269 if ( ref( $xval->{source}->{subtype} ) eq 'ARRAY' ) {
270 foreach my $subtype ( @{ $xval->{source}->{subtype} } ) {
271 $self->_add_to_ann( $subtype->{name}, $subtype->{subtype} );
274 else {
275 $self->_add_to_ann(
276 $xval->{source}->{subtype}->{name},
277 $xval->{source}->{subtype}->{subtype}
282 #Synonyms
283 if ( ref( $xval->{gene}->{syn} ) eq 'ARRAY' ) {
284 foreach my $symsyn ( @{ $xval->{gene}->{syn} } ) {
285 $self->_add_to_ann( $symsyn, 'ALIAS_SYMBOL' );
288 else {
289 $self->_add_to_ann( $xval->{gene}->{syn}, 'ALIAS_SYMBOL' )
290 if ( $xval->{gene}->{syn} );
293 #COMMENTS (STS not dealt with yet)
294 if ( exists( $xval->{comments} ) ) {
295 if ( ref( $xval->{comments} ) eq 'ARRAY' ) {
296 for my $i ( 0 .. $#{ $xval->{comments} } ) {
297 $self->{_current} = $xval->{comments}->[$i];
298 push @alluncaptured, $self->_process_all_comments();
301 else {
302 $self->{_current} = $xval->{comments};
303 push @alluncaptured, $self->_process_all_comments();
307 #Gene
308 if ( exists( $xval->{gene}->{db} ) ) {
309 if ( ref( $xval->{gene}->{db} ) eq 'ARRAY' ) {
310 foreach my $genedb ( @{ $xval->{gene}->{db} } ) {
311 my $id =
312 exists( $genedb->{tag}->{id} )
313 ? $genedb->{tag}->{id}
314 : $genedb->{tag}->{str};
315 $self->_add_to_ann( $id, $genedb->{db} );
318 else {
319 my $id =
320 ( $xval->{gene}->{db}->{tag}->{id} )
321 ? $xval->{gene}->{db}->{tag}->{id}
322 : $xval->{gene}->{db}->{tag}->{str};
323 $self->_add_to_ann( $id, $xval->{gene}->{db}->{db} );
325 $self->_add_to_ann( $xval->{gene}->{'locus-tag'}, 'LOCUS_SYNONYM' );
326 delete $xval->{gene}->{db} unless ( $debug eq 'off' );
329 #LOCATION To do: uncaptured stuff
330 if ( exists( $xval->{location} ) ) {
331 if ( ref( $xval->{location} ) eq 'ARRAY' ) {
332 foreach my $loc ( @{ $xval->{location} } ) {
333 $self->_add_to_ann( $loc->{'display-str'},
334 $loc->{method}->{'map-type'} );
337 else {
338 $self->_add_to_ann(
339 $xval->{location}->{'display-str'},
340 $xval->{location}->{method}->{'map-type'}
343 delete $xval->{location} unless ( $debug eq 'off' );
346 #LOCUS
347 if ( ref( $xval->{locus} ) eq 'ARRAY' ) {
348 foreach my $locus ( @{ $xval->{locus} } ) {
349 $self->{_current} = $locus;
350 push @alluncaptured, $self->_process_locus();
353 else {
354 push @alluncaptured, $self->_process_locus( $xval->{locus} );
357 #Homology
358 my ( $uncapt, $hom, $anchor ) = _process_src( $xval->{homology}->{source} );
359 foreach my $homann (@$hom) {
360 $self->{_ann}->add_Annotation( 'dblink', $homann );
362 push @alluncaptured, $uncapt;
364 #Index terms
365 if ( ( exists( $xval->{'xtra-index-terms'} ) )
366 && ( $xval->{'xtra-index-terms'} ) )
368 if ( ref( $xval->{'xtra-index-terms'} ) eq 'ARRAY' ) {
369 foreach my $term ( @{ $xval->{'xtra-index-terms'} } ) {
370 $self->_add_to_ann( $term, 'Index terms' );
373 else {
374 $self->_add_to_ann( $xval->{'xtra-index-terms'}, 'Index terms' );
378 #PROPERTIES
379 my @prop;
380 if ( exists( $xval->{properties} ) ) {
381 if ( ref( $xval->{properties} ) eq 'ARRAY' ) {
382 foreach my $property ( @{ $xval->{properties} } ) {
383 push @alluncaptured, $self->_process_prop($property);
386 else {
387 push @alluncaptured, $self->_process_prop( $xval->{properties} );
390 $seq->annotation( $self->{_ann} );
391 $seq->species($specie);
392 my @seqs;
393 foreach my $key ( keys %seqcollection )
394 { #Optimize this, no need to go through hash?
395 push @seqs, @{ $seqcollection{$key} };
397 my $cluster = Bio::Cluster::SequenceFamily->new(
398 -family_id => $seq->accession_number,
399 -description => "Entrez Gene " . $seq->accession_number,
400 -members => \@seqs
401 ); #Our EntrezGene object
402 #clean
403 unless ( $debug eq 'off' ) {
404 delete $xval->{homology}->{source};
405 delete( $xval->{summary} );
406 delete( $xval->{'track-info'} );
407 delete( $xval->{gene}{locus} );
408 delete( $xval->{source}{org}{orgname}{lineage} );
409 delete $xval->{source}{org}{orgname}{name}{binomial}{species};
410 delete $xval->{gene}{syn};
411 delete $xval->{source}->{subtype};
412 delete $xval->{comments};
413 delete $xval->{properties};
414 delete $xval->{'xtra-index-terms'};
415 delete $xval->{status};
417 push @alluncaptured, $xval;
418 undef %seqcollection;
419 $seq->annotation( _backcomp_ll( $self->{_ann} ) )
420 if ( $self->{_locuslink} eq 'convert' ); #Fix this!
421 return
422 wantarray
423 ? ( $seq, $cluster, \@alluncaptured )
424 : $seq; #Hilmar's suggestion
427 sub _process_refseq {
428 my $self = shift;
429 my $products = shift;
430 my $ns = shift;
431 my $iter = shift;
432 $iter++;
433 my $pid;
434 my ( @uncaptured, @products );
435 if ( ref($products) eq 'ARRAY' ) { @products = @{$products}; }
436 else { push @products, $products; }
438 foreach my $product (@products) {
439 if ( ( ref($product) eq 'ARRAY' ) && ( $#{$product} > -1 ) ) {
440 $self->_process_refseq( $product, $ns, $iter );
441 next;
443 if (
445 exists( $product->{products} )
446 && ( !exists( $product->{accession} ) )
450 $self->_process_refseq( $product->{products}, $ns );
451 next;
454 #if ((exists($product->{products})&&($product->{products}))) {
455 # $self->_process_refseq($product->{products},$ns,$iter);
457 if ( ( exists( $product->{seqs}->{whole}->{gi} ) )
458 && ( ref( $product->{seqs}->{whole}->{gi} ) eq 'ARRAY' ) )
460 $product->{seqs}->{whole}->{gi} =
461 $product->{seqs}->{whole}->{gi}->[0];
462 } #Lose some data
463 if ( ( exists( $product->{seqs}->{whole}->{gi} ) )
464 || ( exists( $product->{accession} ) ) )
465 { #Minimal data required
466 my $cann = Bio::Annotation::Collection->new();
467 $pid = $product->{accession};
468 my $authority =
469 exists( $product->{type} )
470 ? $product->{type}
471 : $product->{heading};
472 my $nseq = Bio::Seq->new(
473 -accession_number => $product->{seqs}->{whole}->{gi},
474 -display_id => $product->{accession},
475 -authority => $authority,
476 -namespace => $ns
478 if ( exists( $product->{source} ) && ( $product->{source} ) ) {
479 if ( ( !defined( $nseq->authority ) )
480 && ( exists( $product->{source}->{src} ) )
481 && ( exists( $product->{source}->{src}->{db} ) ) )
483 $nseq->authority( $product->{source}->{src}->{db} );
485 my ( $uncapt, $allann ) = _process_src( $product->{source} );
486 push @uncaptured, $uncapt;
487 delete $product->{source};
488 foreach my $annotation ( @{$allann} ) {
489 $cann->add_Annotation( 'dblink', $annotation );
493 delete $product->{seqs}->{whole}->{gi};
494 delete $product->{accession};
495 delete $product->{source};
496 delete $product->{heading};
497 my ( $uncapt, $ann, $cfeat ) =
498 $self->_process_comments( $product->{comment} )
499 if ( exists( $product->{comment} ) );
500 push @uncaptured, $uncapt;
501 foreach my $feat ( @{$cfeat} ) {
502 $nseq->add_SeqFeature($feat);
504 if ( exists( $product->{products} ) && ( $product->{products} ) ) {
505 my ( $uncapt, $prodid ) =
506 $self->_process_refseq( $product->{products} );
507 push @uncaptured, $uncapt;
508 my $simann = Bio::Annotation::SimpleValue->new(
509 -value => $prodid,
510 -tagname => 'product'
512 $cann->add_Annotation($simann);
514 foreach my $key ( keys %$ann ) {
515 foreach my $val ( @{ $ann->{$key} } ) {
516 $cann->add_Annotation( $key, $val );
519 $nseq->annotation($cann);
520 push @{ $seqcollection{seq} }, $nseq;
523 undef @products;
524 undef $products;
526 #my $ti2=new Benchmark;
527 # my $td= timediff($ti2, $ti1);
528 # print "\tITER $iter:",timestr($td),"\n";
529 return \@uncaptured, $pid, $seqcollection{seq}, $iter;
532 sub _process_links {
533 my $self = shift;
534 my $links = shift;
535 my ( @annot, @uncapt );
536 if ( ref($links) eq 'ARRAY' ) {
537 foreach my $link (@$links) {
538 my ( $uncapt, $annot ) = _process_src( $link->{source} )
539 if ( exists( $link->{source} ) );
540 push @uncapt, $uncapt;
541 foreach my $annotation (@$annot) {
542 $self->{_ann}->add_Annotation( 'dblink', $annotation );
546 else {
547 my ( $uncapt, $annot ) = _process_src( $links->{source} )
548 if ( exists( $links->{source} ) );
549 push @uncapt, $uncapt;
550 foreach my $annotation (@$annot) {
551 $self->{_ann}->add_Annotation( 'dblink', $annotation );
554 return @uncapt;
557 sub _add_to_ann { #Highest level only
558 my ( $self, $val, $tag ) = @_;
560 # $val=~s/\n//g;#Low level EG parser leaves this so we take care of them here
561 unless ($tag) {
562 $self->warn( "No tagname for value $val, tag $tag " . $seq->id . "\n" );
563 return;
565 my $simann =
566 Bio::Annotation::SimpleValue->new( -value => $val, -tagname => $tag );
567 $self->{_ann}->add_Annotation($simann);
570 sub _process_comments {
571 my $self = shift;
572 my $prod = shift;
573 my ( %cann, @feat, @uncaptured, @comments, @sfann );
574 if ( ( ref($prod) eq 'HASH' ) && ( exists( $prod->{comment} ) ) ) {
575 $prod = $prod->{comment};
577 if ( ref($prod) eq 'ARRAY' ) {
578 @comments = @{$prod};
580 else {
581 push @comments, $prod;
583 my $i = 0;
584 for my $comm (@comments) { # Each comments is a hash reference
585 $self->throw("Comment not a hash reference")
586 unless ref($comm) eq 'HASH';
587 my ( $desc, $nfeat, $add, @ann, @comm );
589 # next unless (exists($comm->{comment}));#Should be more careful when calling _process_comment:To do
590 my $heading = $comm->{heading} || 'description';
591 if ( !exists( $comm->{comment} ) ) {
592 if ( ( exists( $comm->{type} ) )
593 && ( $comm->{type} )
594 && ( $self->{_current_heading} ) )
596 $comm->{type} = $self->{_current_heading};
598 if ( ( exists( $comm->{source} ) )
599 && ( exists( $comm->{type} ) )
600 && ( exists( $comm->{text} ) )
601 && ( $comm->{type} ne 'comment' ) )
603 my ( $uncapt, $annot, $anchor ) =
604 _process_src( $comm->{source} );
605 my $cann = shift(@$annot);
606 if ( defined $cann ) {
607 $cann->optional_id( $comm->{text} );
608 $cann->authority( $comm->{type} );
609 $cann->version( $comm->{version} );
610 push @sfann, $cann;
615 while (ref($comm) eq 'HASH'
616 && ( exists( $comm->{comment} ) )
617 && ( $comm->{comment} ) )
619 if ( ( exists( $comm->{source} ) ) && ( $comm->{source} ) ) {
620 my ( $uncapt, $allann, $anchor ) =
621 _process_src( $comm->{source} );
622 if ($allann) {
623 delete $comm->{source};
624 push @uncaptured, $uncapt;
625 foreach my $annotation ( @{$allann} ) {
626 if ( $annotation->{_anchor} ) {
627 $desc .= $annotation->{_anchor} . ' ';
629 $annotation->optional_id($heading);
630 push @sfann, $annotation;
631 push @{ $cann{'dblink'} }, $annotation;
636 $comm = $comm->{comment};
638 if ( ref($comm) eq 'ARRAY' ) {
639 @comm = @{$comm};
641 else {
642 push @comm, $comm if ($comm);
645 foreach my $ccomm (@comm) {
646 next unless ($ccomm);
647 if ( exists( $ccomm->{source} ) ) {
648 my ( $uncapt, $allann, $anchor ) =
649 _process_src( $ccomm->{source} );
650 if ($allann) {
651 @sfann = @{$allann};
652 delete $ccomm->{source};
653 push @uncaptured, $uncapt;
656 $ccomm = $ccomm->{comment}
657 if ( exists( $ccomm->{comment} ) ); #Alice in Wonderland???
658 my @loc;
659 if ($ccomm) {
660 if ( ref($ccomm) eq 'ARRAY' ) {
661 @loc = @{$ccomm};
663 else {
664 push @loc, $ccomm;
667 foreach my $loc (@loc) {
668 if ( ( exists( $loc->{text} ) )
669 && ( $loc->{text} =~ /Location/i ) )
671 my ( $l1, $rest ) = split( /-/, $loc->{text} );
672 $l1 =~ s/\D//g;
673 $rest =~ s/^\s//;
674 my ( $l2, $scorestr ) = split( /\s/, $rest, 2 );
675 my ( $scoresrc, $score ) = split( /:/, $scorestr );
676 $score =~ s/\D//g;
677 my ( %tags, $tag );
678 unless ($l1) {
679 next;
681 $nfeat = Bio::SeqFeature::Generic->new(
682 -start => $l1,
683 -end => $l2,
684 -strand => $tags{strand},
685 -source => $loc->{type},
686 -seq_id => $desc,
687 -primary => $heading,
688 -score => $score,
689 -tag => { score_src => $scoresrc }
691 my $sfeatann = Bio::Annotation::Collection->new();
692 foreach my $sfann (@sfann) {
693 $sfeatann->add_Annotation( 'dblink', $sfann );
695 undef @sfann;
696 $nfeat->annotation($sfeatann)
697 ; #Thus the annotation will be available both in the seq and seqfeat?
698 push @feat, $nfeat;
699 delete $loc->{text};
700 delete $loc->{type};
702 elsif ( exists( $loc->{label} ) ) {
703 my $simann = Bio::Annotation::SimpleValue->new(
704 -value => $loc->{text},
705 -tagname => $loc->{label}
707 delete $loc->{text};
708 delete $loc->{label};
709 push @{ $cann{'simple'} }, $simann;
710 push @uncaptured, $loc;
712 elsif ( exists( $loc->{text} ) ) {
713 my $simann = Bio::Annotation::SimpleValue->new(
714 -value => $loc->{text},
715 -tagname => $heading
717 delete $loc->{text};
718 push @{ $cann{'simple'} }, $simann;
719 push @uncaptured, $loc;
722 } #Bit clumsy but that's what we get from the low level parser
724 $i++;
726 if (@sfann) {
727 push @{ $cann{'dblink'} }, @sfann;
728 } #Annotation that is not location specific, for example phenotype
729 #undef $self->{_current_heading};
730 return \@uncaptured, \%cann, \@feat;
733 sub _process_src {
734 my $src = shift;
736 #Trick we do because sometimes we have an array ref
737 my ( @ann, $anch, @uncapt );
738 if ( ref($src) eq 'ARRAY' ) {
739 foreach my $msrc (@$src) {
740 my ( $uncapt, $ann, $anchor ) = _process_src($msrc);
741 push @ann, @$ann;
742 push @uncapt, $uncapt;
743 $anch = $anchor;
745 return \@uncapt, \@ann, $anch;
747 return unless ( exists( $src->{src}->{tag} ) );
749 #my $t0=new Benchmark
750 my $db = $src->{src}->{db};
751 delete $src->{src}->{db};
752 my $anchor = $src->{anchor} || '';
753 delete $src->{anchor};
754 my $url;
755 if ( exists( $src->{url} ) && ( $src->{url} ) ) {
756 $url = $src->{url};
757 $url =~ s/\n//g;
758 delete $src->{url};
760 if ( ( exists( $src->{src}->{tag}->{str} ) )
761 && ( $src->{src}->{tag}->{str} ) )
763 my @sq = split( /[,;]/, $src->{src}->{tag}->{str} );
764 delete $src->{src}->{tag};
765 foreach my $id (@sq) {
766 $id =~ s/\n//g;
767 undef $anchor if ( $anchor eq 'id' );
768 my $simann = Bio::Annotation::DBLink->new(
769 -database => $db,
770 -primary_id => $id,
771 -authority => $src->{heading}
773 $simann->url($url) if ($url); #DBLink should have URL!
774 push @ann, $simann;
777 else {
778 my $id = $src->{src}->{tag}->{id} || '';
779 delete $src->{src}->{tag};
780 undef $anchor if ( $anchor eq 'id' );
781 $id =~ s/\n//g;
782 my $simann = Bio::Annotation::DBLink->new(
783 -database => $db,
784 -primary_id => $id,
785 -authority => $src->{heading}
787 if ($anchor) {
788 $simann->{_anchor} = $anchor;
789 $simann->optional_id($anchor);
791 $simann->url($url) if ($url); #DBLink should have URL!
792 push @ann, $simann;
795 #my $t1=new Benchmark;
796 #my $td= timediff($t1, $t0);
797 #print "\t\tSRC:",timestr($td),"\n";
798 return $src, \@ann, $anchor;
801 sub _add_references {
802 my $self = shift;
803 my $refs = shift;
804 if ( ref($refs) eq 'ARRAY' ) {
805 foreach my $ref (@$refs) {
806 my $refan = Bio::Annotation::Reference->new(
807 -database => 'Pubmed',
808 -primary_id => $ref
810 $self->{_ann}->add_Annotation( 'Reference', $refan );
813 else {
814 my $refan = Bio::Annotation::Reference->new(
815 -database => 'Pubmed',
816 -primary_id => $refs
818 $self->{_ann}->add_Annotation( 'Reference', $refan );
822 #Should we do this at all if no seq coord are present?
823 sub _process_locus {
824 my $self = shift;
825 my @uncapt;
826 return $self
827 unless ( exists( $self->{_current}->{accession} )
828 && ( $self->{_current}->{accession} ) );
829 my $gseq = Bio::Seq->new(
830 -display_id => $self->{_current}->{accession},
831 -version => $self->{_current}->{version},
832 -accession_number => $self->{_current}->{seqs}->{'int'}->{id}->{gi},
833 -authority => $self->{_current}->{type},
834 -namespace => $self->{_current}->{heading}
836 delete $self->{_current}->{accession};
837 delete $self->{_current}->{version};
838 delete $self->{_current}->{'int'}->{id}->{gi};
839 my ( $start, $end, $strand );
841 if ( exists( $self->{_current}->{seqs}->{'int'}->{from} ) ) {
842 $start = $self->{_current}->{seqs}->{'int'}->{from};
843 delete $self->{_current}->{seqs}->{'int'}->{from};
845 #unless ($start) {print $locus->{seqs}->{'int'}->{from},"\n",$locus,"\n";}
846 $end = $self->{_current}->{seqs}->{'int'}->{to};
847 delete $self->{_current}->{seqs}->{'int'}->{to};
848 delete $self->{_current}->{seqs}->{'int'}->{strand};
849 $strand =
850 $self->{_current}->{seqs}->{'int'}->{strand} eq 'minus' ? -1 : 1
851 if ( exists( $self->{_current}->{seqs}->{'int'}->{strand} ) )
852 ; #1 being default
853 my $nfeat = Bio::SeqFeature::Generic->new(
854 -start => $start,
855 -end => $end,
856 -strand => $strand,
857 primary => 'gene location'
859 $gseq->add_SeqFeature($nfeat);
861 my @products;
862 if ( ref( $self->{_current}->{products} ) eq 'ARRAY' ) {
863 @products = @{ $self->{_current}->{products} };
865 else {
866 push @products, $self->{_current}->{products};
868 delete $self->{_current}->{products};
869 my $gstruct = Bio::SeqFeature::Gene::GeneStructure->new();
870 foreach my $product (@products) {
871 my ( $tr, $uncapt ) =
872 _process_products_coordinates( $product, $start, $end, $strand );
873 $gstruct->add_transcript($tr) if ($tr);
874 undef $tr->{parent}; #Because of a cycleG
875 push @uncapt, $uncapt;
877 $gseq->add_SeqFeature($gstruct);
878 push @{ $seqcollection{genestructure} }, $gseq;
879 return @uncapt;
882 =head1 _process_products_coordinates
884 To do:
886 =cut
888 sub _process_products_coordinates {
889 my $coord = shift;
890 my $start = shift
891 || 0; #In case it is not known: should there be an entry at all?
892 my $end = shift || 1;
893 my $strand = shift || 1;
894 my ( @coords, @uncapt );
895 return unless ( exists( $coord->{accession} ) );
896 my $transcript = Bio::SeqFeature::Gene::Transcript->new(
897 -primary => $coord->{accession}, #Desc is actually non functional...
898 -start => $start,
899 -end => $end,
900 -strand => $strand,
901 -desc => $coord->{type}
904 if ( ( exists( $coord->{'genomic-coords'}->{mix}->{'int'} ) )
905 || ( exists( $coord->{'genomic-coords'}->{'packed-int'} ) ) )
907 @coords =
908 exists( $coord->{'genomic-coords'}->{mix}->{'int'} )
909 ? @{ $coord->{'genomic-coords'}->{mix}->{'int'} }
910 : @{ $coord->{'genomic-coords'}->{'packed-int'} };
911 foreach my $exon (@coords) {
912 next unless ( exists( $exon->{from} ) );
913 my $exonobj = Bio::SeqFeature::Gene::Exon->new(
914 -start => $exon->{from},
915 -end => $exon->{to},
916 -strand => $strand
918 $transcript->add_exon($exonobj);
919 delete $exon->{from};
920 delete $exon->{to};
921 delete $exon->{strand};
922 push @uncapt, $exon;
925 my ( $prot, $uncapt );
926 if ( exists( $coord->{products} ) ) {
927 my ( $prot, $uncapt ) =
928 _process_products_coordinates( $coord->{products},
929 $start, $end, $strand );
930 $transcript->add_SeqFeature($prot);
931 push @uncapt, $uncapt;
933 return $transcript, \@uncapt;
936 =head1 _process_prop
938 To do: process GO
940 =cut
942 sub _process_prop {
943 my $self = shift;
944 my $prop = shift;
945 my @uncapt;
946 if ( exists( $prop->{properties} ) ) { #Iterate
947 if ( ref( $prop->{properties} ) eq 'ARRAY' ) {
948 foreach my $propn ( @{ $prop->{properties} } ) {
949 push @uncapt, $self->_process_prop($propn);
952 else {
953 push @uncapt, $self->_process_prop( $prop->{properties} );
956 unless ( ( exists( $prop->{heading} ) )
957 && ( $prop->{heading} eq 'GeneOntology' ) )
959 $self->_add_to_ann( $prop->{text}, $prop->{label} )
960 if ( exists( $prop->{text} ) );
961 delete $prop->{text};
962 delete $prop->{label};
963 push @uncapt, $prop;
964 return \@uncapt;
967 #Will do GO later
968 if ( exists( $prop->{comment} ) ) {
969 push @uncapt, $self->_process_go( $prop->{comment} );
973 sub _process_all_comments {
974 my $self = shift;
975 my $product = $self->{_current}; #Better without copying
976 my @alluncaptured;
977 my $heading = $product->{heading} if ( exists( $product->{heading} ) );
978 if ($heading) {
980 #my $tx1=new Benchmark;
981 delete $product->{heading};
982 CLASS: {
983 if ( $heading =~ 'RefSeq Status' )
984 { #IN case NCBI changes slightly the spacing:-)
985 $self->_add_to_ann( $product->{label}, 'RefSeq status' );
986 last CLASS;
988 if ( $heading =~ 'NCBI Reference Sequences' )
989 { #IN case NCBI changes slightly the spacing:-)
990 if ( ( exists( $product->{comment} ) )
991 && ( !exists( $product->{products} ) ) )
993 $product->{products} = $product->{comment};
996 #unless (($product->{products})&&(exists($product->{comment}))) {
997 #if (ref ($product->{comment}) eq 'ARRAY') {
998 # foreach my $pc (@{$product->{comment}}) {
999 # push @{$product->{products}},$pc->{products};
1002 #else {
1003 # $product->{products}=exists($product->{comments}->{products})?$product->{comments}->{products}:$product->{comment};
1006 my @uncaptured =
1007 $self->_process_refseq( $product->{products}, 'refseq' );
1008 push @alluncaptured, @uncaptured;
1009 last CLASS;
1011 if ( ( $heading =~ 'Related Sequences' )
1012 && ( exists( $product->{products} ) ) )
1013 { #IN case NCBI changes slightly the spacing:-)
1014 my @uncaptured = $self->_process_refseq( $product->{products} );
1015 push @alluncaptured, @uncaptured;
1016 last CLASS;
1018 if ( ( $heading =~ 'Additional Links' )
1019 && ( exists( $product->{comment} ) ) )
1020 { #IN case NCBI changes slightly the spacing:-)
1021 push @alluncaptured,
1022 $self->_process_links( $product->{comment} );
1023 last CLASS;
1025 if ( $heading =~ 'LocusTagLink' )
1026 { #IN case NCBI changes slightly the spacing:-)
1027 $self->_add_to_ann(
1028 $product->{source}->{src}->{tag}->{id},
1029 $product->{source}->{src}->{db}
1031 last CLASS;
1033 if ( ( $heading =~ 'Sequence Tagged Sites' )
1034 && ( exists( $product->{comment} ) ) )
1035 { #IN case NCBI changes slightly the spacing:-)
1036 push @alluncaptured, $self->_process_STS( $product->{comment} );
1037 delete $product->{comment};
1038 last CLASS;
1040 if ( $heading =~ 'Pathways' ) {
1041 $self->{_current_heading} = 'Pathways';
1042 last CLASS;
1046 # my $tx2=new Benchmark;
1047 # my $td= timediff($tx2, $tx1);
1048 #print "\t\t$heading:",timestr($td),"\n";
1051 if ( exists( $product->{type} ) && ( $product->{type} eq 'generif' ) ) {
1052 push @alluncaptured, $self->_process_grif($product);
1053 return @alluncaptured; #Maybe still process the comments?
1055 if ( exists( $product->{refs} ) ) {
1056 $self->_add_references( $product->{refs}->{pmid} );
1057 delete $product->{refs}->{pmid};
1058 push @alluncaptured, $product;
1060 if ( exists( $product->{comment} ) ) {
1061 my ( $uncapt, $allan, $allfeat ) =
1062 $self->_process_comments( $product->{comment} );
1063 foreach my $key ( keys %$allan ) {
1064 foreach my $val ( @{ $allan->{$key} } ) {
1065 $self->{_ann}->add_Annotation( $key, $val );
1068 delete $product->{refs}->{comment};
1069 push @alluncaptured, $uncapt;
1072 #if (exists($product->{source})) {
1073 # my ($uncapt,$ann,$anchor)=_process_src($product->{source});
1074 # foreach my $dbl (@$ann) {
1075 # $self->{_ann}->add_Annotation('dblink',$dbl);
1078 return @alluncaptured;
1081 sub _process_STS {
1082 my $self = shift;
1083 my $comment = shift;
1084 my @comm;
1085 push @comm, ( ref($comment) eq 'ARRAY' ) ? @{$comment} : $comment;
1086 foreach my $product (@comm) {
1087 my $sts = Bio::Ontology::Term->new(
1088 -identifier => $product->{source}->{src}->{tag}->{id},
1089 -name => $product->{source}->{anchor},
1090 -comment => $product->{source}->{'post-text'}
1092 $sts->namespace( $product->{source}->{src}->{db} );
1093 $sts->authority('STS marker');
1094 my @alt;
1095 if ( exists( $product->{comment} ) ) {
1096 push @alt,
1097 ( ref( $product->{comment} ) eq 'ARRAY' )
1098 ? @{ $product->{comment} }
1099 : $product->{comment};
1100 foreach my $alt (@alt) {
1101 $sts->add_synonym( $alt->{text} );
1104 my $annterm = Bio::Annotation::OntologyTerm->new();
1105 $annterm->term($sts);
1106 $self->{_ann}->add_Annotation( 'OntologyTerm', $annterm );
1110 sub _process_go {
1111 my $self = shift;
1112 my $comm = shift;
1113 my @comm;
1114 push @comm, ( ref($comm) eq 'ARRAY' ) ? @{$comm} : $comm;
1115 foreach my $comp (@comm) {
1116 my $category = $comp->{label};
1117 if ( ref( $comp->{comment} ) eq 'ARRAY' ) {
1118 foreach my $go ( @{ $comp->{comment} } ) {
1119 my $term = _get_go_term( $go, $category );
1120 my $annterm = Bio::Annotation::OntologyTerm->new(
1121 -tagname => 'Gene Ontology' );
1122 $annterm->term($term);
1123 $self->{_ann}->add_Annotation( 'OntologyTerm', $annterm );
1126 else {
1127 my $term = _get_go_term( $comp->{comment}, $category );
1128 my $annterm =
1129 Bio::Annotation::OntologyTerm->new( -tagname => 'Gene Ontology' );
1130 $annterm->term($term);
1131 $self->{_ann}->add_Annotation( 'OntologyTerm', $annterm );
1136 sub _process_grif {
1137 my $self = shift;
1138 my $grif = shift;
1139 if ( ( exists( $grif->{comment} ) )
1140 && ( ref( $grif->{comment} ) eq 'ARRAY' ) )
1142 my @uncapt;
1143 foreach my $product ( @{ $grif->{comment} } ) {
1144 next unless ( exists( $product->{text} ) );
1145 my $uproduct = $self->_process_grif($product);
1147 #$self->{_ann->add_Annotation($type,$grifobj);
1148 push @uncapt, $uproduct;
1150 return \@uncapt;
1152 if ( exists( $grif->{comment}->{comment} ) ) {
1153 $grif = $grif->{comment};
1155 my $ref =
1156 ( ref( $grif->{refs} ) eq 'ARRAY' )
1157 ? shift @{ $grif->{refs} }
1158 : $grif->{refs};
1159 my $refergene = '';
1160 my $refdb = '';
1161 my ( $obj, $type );
1162 if ( $ref->{pmid} ) {
1163 if ( exists( $grif->{source} ) )
1164 { #unfortunatrely we cannot put yet everything in
1165 $refergene = $grif->{source}->{src}->{tag}->{id};
1166 $refdb = $grif->{source}->{src}->{db};
1168 my $grifobj = new Bio::Annotation::Comment( -text => $grif->{text} );
1169 $obj = Bio::Annotation::DBLink->new(
1170 -database => 'generif',
1171 -primary_id => $ref->{pmid}
1172 , #The pubmed id (at least the first one) which is a base for the conclusion
1173 -version => $grif->{version},
1174 -optional_id => $refergene,
1175 -authority => $refdb
1177 $obj->comment($grifobj);
1178 $type = 'dblink';
1180 else {
1181 $obj = new Bio::Annotation::SimpleValue( $grif->{text}, 'generif' );
1182 $type = 'generif';
1184 delete $grif->{text};
1185 delete $grif->{version};
1186 delete $grif->{type};
1187 delete $grif->{refs};
1188 $self->{_ann}->add_Annotation( $type, $obj );
1189 return $grif;
1192 sub _get_go_term {
1193 my $go = shift;
1194 my $category = shift;
1195 my $refan = Bio::Annotation::Reference->new( #We expect one ref per GO
1196 -medline => $go->{refs}->{pmid},
1197 -title => 'no title'
1199 my $term = Bio::Ontology::Term->new(
1200 -identifier => $go->{source}->{src}->{tag}->{id},
1201 -name => $go->{source}->{anchor},
1202 -definition => $go->{source}->{anchor},
1203 -comment => $go->{source}->{'post-text'},
1204 -version => $go->{version}
1206 $term->add_reference($refan);
1207 $term->namespace($category);
1208 return $term;
1211 sub _backcomp_ll {
1212 my $ann = shift;
1213 my $newann = Bio::Annotation::Collection->new();
1215 #$newann->{_annotation}->{ALIAS_SYMBOL}=$ann->{_annotation}->{ALIAS_SYMBOL};
1216 # $newann->{_annotation}->{CHR}=$ann->{_annotation}->{chromosome};
1217 # $newann->{_annotation}->{MAP}=$ann->{_annotation}->{cyto};
1218 foreach my $tagmap ( keys %{ $ann->{_typemap}->{_type} } ) {
1219 next if ( grep( /$tagmap/, @main::egonly ) );
1220 $newann->{_annotation}->{$tagmap} = $ann->{_annotation}->{$tagmap};
1223 #$newann->{_annotation}->{Reference}=$ann->{_annotation}->{Reference};
1224 #$newann->{_annotation}->{generif}=$ann->{_annotation}->{generif};
1225 #$newann->{_annotation}->{comment}=$ann->{_annotation}->{comment};
1226 # $newann->{_annotation}->{OFFICIAL_GENE_NAME}=$ann->{_annotation}->{'Official Full Name'};
1227 $newann->{_typemap}->{_type} = $ann->{_typemap}->{_type};
1228 foreach my $ftype ( keys %main::eg_to_ll ) {
1229 my $newkey = $main::eg_to_ll{$ftype};
1230 $newann->{_annotation}->{$newkey} = $ann->{_annotation}->{$ftype};
1231 $newann->{_typemap}->{_type}->{$newkey} =
1232 'Bio::Annotation::SimpleValue';
1233 delete $newann->{_typemap}->{_type}->{$ftype};
1234 $newann->{_annotation}->{$newkey}->[0]->{tagname} = $newkey;
1236 foreach my $dblink ( @{ $newann->{_annotation}->{dblink} } ) {
1237 next unless ( $dblink->{_url} );
1238 my $simann = Bio::Annotation::SimpleValue->new(
1239 -value => $dblink->{_url},
1240 -tagname => 'URL'
1242 $newann->add_Annotation($simann);
1245 # my $simann=Bio::Annotation::SimpleValue->new(-value=>$seq->desc,-tagname=>'comment');
1246 # $newann->add_Annotation($simann);
1248 return $newann;