maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / SeqIO / Handler / GenericRichSeqHandler.pm
blob733e4eaa212327354f02c271956d5a5368d09c95
2 # BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17 data handler for GenBank/EMBL/UniProt (and other) sequence data
19 =head1 SYNOPSIS
21 # MyHandler is a GenericRichSeqHandler object.
22 # inside a parser (driver) constructor....
24 $self->seq_handler($handler || MyHandler->new(-format => 'genbank'));
26 # in next_seq() in driver...
28 $hobj = $self->seqhandler();
30 # roll data up into hashref chunks, pass off into Handler for processing...
32 $hobj->data_handler($data);
34 # or retrieve Handler methods and pass data directly to Handler methods...
36 my $hmeth = $hobj->handler_methods;
38 if ($hmeth->{ $data->{NAME} }) {
39 my $mth = $hmeth->{ $data->{NAME} };
40 $hobj->$mth($data);
43 =head1 DESCRIPTION
45 This is an experimental implementation of a sequence-based HandlerBaseI parser
46 and may change over time. It is possible (nay, likely) that the way handler
47 methods are set up will change over development to allow more flexibility.
48 Release pumpkins, please do not add this to a release until the API has settled.
49 It is also likely that write_seq() will not work properly for some data.
51 Standard Developer caveats:
53 Do not use for production purposes.
54 Not responsible for destroying (your data|computer|world).
55 Do not stare directly at GenericRichSeqHandler.
56 If GenericRichSeqHandler glows, back slowly away and call for help.
58 Consider yourself warned!
60 This class acts as a demonstration on how to handle similar data chunks derived
61 from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62 using similar (or the same) handler methods.
64 The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65 t/swiss.t yet all use the same handler methods (the collected tests for handlers
66 can be found in t/Handler.t). Some tweaking of the methods themselves is
67 probably in order over the long run to ensure that data is consistently handled
68 for each parser. Round-trip tests are probably in order here...
70 Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71 bypassing of data based on builder slots has been implemented (yet); this is
72 planned in the near future.
74 As a reminder: this is the current Annotation data chunk (via Data::Dumper):
76 $VAR1 = {
77 'NAME' => 'REFERENCE',
78 'DATA' => '1 (bases 1 to 10001)'
79 'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80 'TITLE' => 'The DNA sequence of Homo sapiens'
81 'JOURNAL' => 'Unpublished (2003)'
83 ...
85 This is the current SeqFeature data chunk (again via Data::Dumper):
87 $VAR1 = {
88 'mol_type' => 'genomic DNA',
89 'LOCATION' => '<1..>10001',
90 'NAME' => 'FEATURES',
91 'FEATURE_KEY' => 'source',
92 'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93 'db_xref' => 'taxon:9606',
94 'clone' => 'RP11-302I18',
95 'organism' => 'Homo sapiens'
98 =head1 FEEDBACK
100 =head2 Mailing Lists
102 User feedback is an integral part of the evolution of this and other
103 Bioperl modules. Send your comments and suggestions preferably to one
104 of the Bioperl mailing lists. Your participation is much appreciated.
106 bioperl-l@bioperl.org - General discussion
107 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
109 =head2 Support
111 Please direct usage questions or support issues to the mailing list:
113 I<bioperl-l@bioperl.org>
115 rather than to the module maintainer directly. Many experienced and
116 reponsive experts will be able look at the problem and quickly
117 address it. Please include a thorough description of the problem
118 with code and data examples if at all possible.
120 =head2 Reporting Bugs
122 Report bugs to the Bioperl bug tracking system to help us keep track
123 the bugs and their resolution. Bug reports can be submitted via the
124 web:
126 https://github.com/bioperl/bioperl-live/issues
128 =head1 AUTHOR - Chris Fields
130 Email cjfields at bioperl dot org
132 =head1 APPENDIX
134 The rest of the documentation details each of the object methods. Internal
135 methods are usually preceded with a _
137 =cut
139 # Let the code begin...
141 package Bio::SeqIO::Handler::GenericRichSeqHandler;
142 use strict;
143 use warnings;
145 use Bio::SeqIO::FTHelper;
146 use Bio::Annotation::Collection;
147 use Bio::Annotation::DBLink;
148 use Bio::Annotation::Comment;
149 use Bio::Annotation::Reference;
150 use Bio::Annotation::Collection;
151 use Bio::Annotation::SimpleValue;
152 use Bio::Annotation::TagTree;
153 use Bio::SeqFeature::Generic;
154 use Bio::Species;
155 use Bio::Taxon;
156 use Bio::DB::Taxonomy;
157 use Bio::Factory::FTLocationFactory;
158 use Data::Dumper;
160 use base qw(Bio::Root::Root Bio::HandlerBaseI);
162 my %HANDLERS = (
163 'genbank' => {
164 'LOCUS' => \&_genbank_locus,
165 'DEFINITION' => \&_generic_description,
166 'ACCESSION' => \&_generic_accession,
167 'VERSION' => \&_generic_version,
168 'KEYWORDS' => \&_generic_keywords,
169 'DBSOURCE' => \&_genbank_dbsource,
170 'DBLINK' => \&_genbank_dbsource,
171 'SOURCE' => \&_generic_species,
172 'REFERENCE' => \&_generic_reference,
173 'COMMENT' => \&_generic_comment,
174 'FEATURES' => \&_generic_seqfeatures,
175 'BASE' => \&noop, # this is generated from scratch
176 'ORIGIN' => \&_generic_seq,
177 # handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178 '_DEFAULT_' => \&_generic_simplevalue,
180 'embl' => {
181 'ID' => \&_embl_id,
182 'DT' => \&_embl_date,
183 'DR' => \&_generic_dbsource,
184 'SV' => \&_generic_version,
185 'RN' => \&_generic_reference,
186 'KW' => \&_generic_keywords,
187 'DE' => \&_generic_description,
188 'AC' => \&_generic_accession,
189 #'AH' => \&noop, # TPA data not dealt with yet...
190 #'AS' => \&noop,
191 'SQ' => \&_generic_seq,
192 'OS' => \&_generic_species,
193 'CC' => \&_generic_comment,
194 'FT' => \&_generic_seqfeatures,
195 # handles anything else (WGS, TPA, ANN...)
196 '_DEFAULT_' => \&_generic_simplevalue,
198 'swiss' => {
199 'ID' => \&_swiss_id,
200 'DT' => \&_swiss_date,
201 'GN' => \&_swiss_genename,
202 'DR' => \&_generic_dbsource,
203 'RN' => \&_generic_reference,
204 'KW' => \&_generic_keywords,
205 'DE' => \&_generic_description,
206 'AC' => \&_generic_accession,
207 'SQ' => \&_generic_seq,
208 'OS' => \&_generic_species,
209 'CC' => \&_generic_comment,
210 'FT' => \&_generic_seqfeatures,
211 # handles anything else, though I don't know what...
212 '_DEFAULT_' => \&_generic_simplevalue,
216 # can we do this generically? Seems like a lot of trouble...
217 my %DBSOURCE = map {$_ => 1} qw(
218 EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219 TIGR GO InterPro Pfam PROSITE SGD GermOnline
220 HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221 H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222 MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223 UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224 PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225 Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226 TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227 PhotoList Gramene WormBase WormPep Genew ZFIN
228 PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229 swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
231 my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
233 our %VALID_ALPHABET = (
234 'bp' => 'dna',
235 'aa' => 'protein',
236 'rc' => '' # rc = release candidate; file has no sequences
239 =head2 new
241 Title : new
242 Usage :
243 Function:
244 Returns :
245 Args : -format Sequence format to be mapped for handler methods
246 -builder Bio::Seq::SeqBuilder object (normally defined in
247 SequenceStreamI object implementation constructor)
248 Throws : On undefined '-format' sequence format parameter
249 Note : Still under heavy development
251 =cut
253 sub new {
254 my ($class, @args) = @_;
255 my $self = $class->SUPER::new(@args);
256 $self = {@args};
257 bless $self,$class;
258 my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259 $self->throw("Must define sequence record format") if !$format;
260 $self->format($format);
261 $self->handler_methods();
262 $builder && $self->seqbuilder($builder);
263 $self->location_factory();
264 return $self;
267 =head1 L<Bio::HandlerBaseI> implementing methods
269 =head2 handler_methods
271 Title : handler_methods
272 Usage : $handler->handler_methods('GenBank')
273 %handlers = $handler->handler_methods();
274 Function: Retrieve the handler methods used for the current format() in
275 the handler. This assumes the handler methods are already
276 described in the HandlerI-implementing class.
277 Returns : a hash reference with the data type handled and the code ref
278 associated with it.
279 Args : [optional] String representing the sequence format. If set here
280 this will also set sequence_format()
281 Throws : On unimplemented sequence format in %HANDLERS
283 =cut
285 sub handler_methods {
286 my $self = shift;
287 if (!($self->{'handlers'})) {
288 $self->throw("No handlers defined for seqformat ",$self->format)
289 unless exists $HANDLERS{$self->format};
290 $self->{'handlers'} = $HANDLERS{$self->format};
292 return ($self->{'handlers'});
295 =head2 data_handler
297 Title : data_handler
298 Usage : $handler->data_handler($data)
299 Function: Centralized method which accepts all data chunks, then distributes
300 to the appropriate methods for processing based on the chunk name
301 from within the HandlerBaseI object.
303 One can also use
304 Returns : None
305 Args : an hash ref containing a data chunk.
307 =cut
309 sub data_handler {
310 my ($self, $data) = @_;
311 my $nm = $data->{NAME} || $self->throw("No name tag defined!");
313 # this should handle data on the fly w/o caching; any caching should be
314 # done in the driver!
315 my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316 (exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
317 undef;
318 if (!$method) {
319 $self->debug("No handler defined for $nm\n");
320 return;
322 $self->$method($data);
325 =head2 reset_parameters
327 Title : reset_parameters
328 Usage : $handler->reset_parameters()
329 Function: Resets the internal cache of data (normally object parameters for
330 a builder or factory)
331 Returns : None
332 Args : None
334 =cut
336 sub reset_parameters {
337 my $self = shift;
338 $self->{'_params'} = undef;
341 =head2 format
343 Title : format
344 Usage : $handler->format('GenBank')
345 Function: Get/Set the format for the report/record being parsed. This can be
346 used to set handlers in classes which are capable of processing
347 similar data chunks from multiple driver modules.
348 Returns : String with the sequence format
349 Args : [optional] String with the sequence format
350 Note : The format may be used to set the handlers (as in the
351 current GenericRichSeqHandler implementation)
353 =cut
355 sub format {
356 my $self = shift;
357 return $self->{'_seqformat'} = lc shift if @_;
358 return $self->{'_seqformat'};
361 =head2 get_params
363 Title : get_params
364 Usage : $handler->get_params('-species')
365 Function: Convenience method used to retrieve the specified
366 parameters from the internal parameter cache
367 Returns : Hash ref containing parameters requested and data as
368 key-value pairs. Note that some parameter values may be
369 objects, arrays, etc.
370 Args : List (array) representing the parameters requested
372 =cut
374 sub get_params {
375 my ($self, @ids) = @_;
376 my %data;
377 for my $id (@ids) {
378 if (!index($id, '-')==0) {
379 $id = '-'.$id ;
381 $data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
383 return \%data;
386 =head2 set_params
388 Title : set_params
389 Usage : $handler->set_param({'-species')
390 Function: Convenience method used to set specific parameters
391 Returns : None
392 Args : Hash ref containing the data to be passed as key-value pairs
394 =cut
396 sub set_params {
397 shift->throw('Not implemented yet!');
400 =head1 Methods unique to this implementation
402 =head2 seqbuilder
404 Title : seqbuilder
405 Usage :
406 Function:
407 Returns :
408 Args :
409 Throws :
410 Note :
412 =cut
414 sub seqbuilder {
415 my $self = shift;
416 return $self->{'_seqbuilder'} = shift if @_;
417 return $self->{'_seqbuilder'};
420 =head2 build_sequence
422 Title : build_sequence
423 Usage :
424 Function:
425 Returns :
426 Args :
427 Throws :
428 Note :
430 =cut
432 sub build_sequence {
433 my $self = shift;
434 my $builder = $self->seqbuilder();
435 my $seq;
436 if (defined($self->{'_params'})) {
437 $builder->add_slot_value(%{ $self->{'_params'} });
438 $seq = $builder->make_object();
439 $self->reset_parameters;
441 return $seq if $seq;
442 return 0;
445 =head2 location_factory
447 Title : location_factory
448 Usage :
449 Function:
450 Returns :
451 Args :
452 Throws :
453 Note :
455 =cut
457 sub location_factory {
458 my ($self, $factory) = @_;
459 if ($factory) {
460 $self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461 "explicitly setting factory()") unless
462 (ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463 $self->{'_locfactory'} = $factory;
464 } elsif (!defined($self->{'_locfactory'})) {
465 $self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
467 return $self->{'_locfactory'};
470 =head2 annotation_collection
472 Title : annotation_collection
473 Usage :
474 Function:
475 Returns :
476 Args :
477 Throws :
478 Note :
480 =cut
482 sub annotation_collection {
483 my ($self, $coll) = @_;
484 if ($coll) {
485 $self->throw("Must have Bio::AnnotationCollectionI ".
486 "when explicitly setting collection()")
487 unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488 $self->{'_params'}->{'-annotation'} = $coll;
489 } elsif (!exists($self->{'_params'}->{'-annotation'})) {
490 $self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
492 return $self->{'_params'}->{'-annotation'};
495 ####################### SEQUENCE HANDLERS #######################
497 # any sequence data
498 sub _generic_seq {
499 my ($self, $data) = @_;
500 $self->{'_params'}->{'-seq'} = $data->{DATA};
503 ####################### RAW DATA HANDLERS #######################
505 # GenBank LOCUS line
506 sub _genbank_locus {
507 my ($self, $data) = @_;
508 my (@tokens) = split m{\s+}, $data->{DATA};
509 my $display_id = shift @tokens;
510 $self->{'_params'}->{'-display_id'} = $display_id;
511 my $seqlength = shift @tokens;
512 if (exists $VALID_ALPHABET{$seqlength}) {
513 # moved one token too far. No locus name?
514 $self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515 "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516 $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517 $self->{'_params'}->{'-display_id'} = 'unknown';
518 # add token back...
519 unshift @tokens, $seqlength;
520 } else {
521 $self->{'_params'}->{'-length'} = $seqlength;
523 my $alphabet = lc(shift @tokens);
524 $self->{'_params'}->{'-alphabet'} =
525 (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526 $self->warn("Unknown alphabet: $alphabet");
527 if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
528 $self->{'_params'}->{'-molecule'} = shift(@tokens);
529 my $circ = shift(@tokens);
530 if ($circ eq 'circular') {
531 $self->{'_params'}->{'-is_circular'} = 1;
532 $self->{'_params'}->{'-division'} = shift(@tokens);
533 } else {
534 # 'linear' or 'circular' may actually be omitted altogether
535 $self->{'_params'}->{'-division'} =
536 (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
538 } else {
539 $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540 $self->{'_params'}->{'-division'} = shift(@tokens);
542 my $date = join(' ', @tokens);
543 # maybe use Date::Time for dates?
544 if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
546 if( length($date) < 11 ) {
547 # improperly formatted date
548 # But we'll be nice and fix it for them
549 my ($d,$m,$y) = ($2,$3,$4);
550 if( length($d) == 1 ) {
551 $d = "0$d";
553 # guess the century here
554 if( length($y) == 2 ) {
555 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556 $y = "19$y";
557 } else {
558 $y = "20$y";
560 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
562 $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563 } else {
564 $self->{'_params'}->{'-dates'} = [$date];
569 # EMBL ID line
570 sub _embl_id {
571 my ($self, $data) = @_;
572 my $alphabet;
573 my ($name, $sv, $topology, $mol, $div);
574 my $line = $data->{DATA};
575 #$self->debug("$line\n");
576 my ($idtype) = $line =~ tr/;/;/;
577 if ( $idtype == 6) { # New style headers contain exactly six semicolons.
578 # New style header (EMBL Release >= 87, after June 2006)
579 my $topology;
580 my $sv;
582 # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583 # This regexp comes from the new2old.pl conversion script, from EBI
584 if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586 } else {
587 $self->throw("Unrecognized EMBL ID line:[$line]");
589 if (defined($sv)) {
590 $self->{'_params'}->{'-seq_version'} = $sv;
591 $self->{'_params'}->{'-version'} = $sv;
594 if ($topology eq "circular") {
595 $self->{'_params'}->{'-is_circular'} = 1;
598 if (defined $mol ) {
599 if ($mol =~ /DNA/) {
600 $alphabet='dna';
602 elsif ($mol =~ /RNA/) {
603 $alphabet='rna';
605 elsif ($mol =~ /AA/) {
606 $alphabet='protein';
609 } elsif ($idtype) { # has internal ';'
610 # Old style header (EMBL Release < 87, before June 2006)
611 if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) {
612 ($name, $mol, $div) = ($1, $2, $3);
613 #$self->debug("[$name][$mol][$div]");
616 if($mol) {
617 if ( $mol =~ m{circular} ) {
618 $self->{'_params'}->{'-is_circular'} = 1;
619 $mol =~ s{circular }{};
621 if (defined $mol ) {
622 if ($mol =~ /DNA/) {
623 $alphabet='dna';
625 elsif ($mol =~ /RNA/) {
626 $alphabet='rna';
628 elsif ($mol =~ /AA/) {
629 $alphabet='protein';
633 } else {
634 $name = $data->{DATA};
636 unless( defined $name && length($name) ) {
637 $name = "unknown_id";
639 $self->{'_params'}->{'-display_id'} = $name;
640 $self->{'_params'}->{'-alphabet'} = $alphabet;
641 $self->{'_params'}->{'-division'} = $div if $div;
642 $self->{'_params'}->{'-molecule'} = $mol if $mol;
645 # UniProt/SwissProt ID line
646 sub _swiss_id {
647 my ($self, $data) = @_;
648 my ($name, $seq_div);
649 if($data->{DATA} =~ m{^
650 (\S+) \s+ # $1 entryname
651 ([^\s;]+); \s+ # $2 DataClass
652 (?:PRT;)? \s+ # Molecule Type (optional)
653 [0-9]+[ ]AA \. # Sequencelength (capture?)
655 }ox ) {
656 ($name, $seq_div) = ($1, $2);
657 $self->{'_params'}->{'-namespace'} =
658 ($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659 ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660 $seq_div;
661 # we shouldn't be setting the division, but for now...
662 my ($junk, $division) = split q(_), $name;
663 $self->{'_params'}->{'-division'} = $division;
664 $self->{'_params'}->{'-alphabet'} = 'protein';
665 # this is important to have the id for display in e.g. FTHelper, otherwise
666 # you won't know which entry caused an error
667 $self->{'_params'}->{'-display_id'} = $name;
668 } else {
669 $self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
673 # UniProt/SwissProt GN line
674 sub _swiss_genename {
675 my ($self, $data) = @_;
676 #$self->debug(Dumper($data));
677 my $genename = $data->{DATA};
678 my $gn;
679 if ($genename) {
680 my @stags;
681 if ($genename =~ /\w=\w/) {
682 # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683 for my $n (split(m{\s+and\s+},$genename)) {
684 my @genenames;
685 for my $section (split(m{\s*;\s*},$n)) {
686 my ($tag, $rest) = split("=",$section);
687 $rest ||= '';
688 for my $val (split(m{\s*,\s*},$rest)) {
689 push @genenames, [$tag => $val];
692 push @stags, ['gene_name' => \@genenames];
694 } else {
695 # old format
696 for my $section (split(/ AND /, $genename)) {
697 my @genenames;
698 $section =~ s/[\(\)\.]//g;
699 my @names = split(m{\s+OR\s+}, $section);
700 push @genenames, ['Name' => shift @names];
701 push @genenames, map {['Synonyms' => $_]} @names;
702 push @stags, ['gene_name' => \@genenames]
704 } #use Data::Dumper; print Dumper $gn, $genename;# exit;
705 my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706 -value => ['gene_names' => \@stags]);
707 $self->annotation_collection->add_Annotation('gene_name', $gn);
711 # GenBank VERSION line
712 # old EMBL SV line (now obsolete)
713 # UniProt/SwissProt?
714 sub _generic_version {
715 my ($self, $data) = @_;
716 my ($acc,$gi) = split(' ',$data->{DATA});
717 if($acc =~ m{^\w+\.(\d+)}xmso) {
718 $self->{'_params'}->{'-version'} = $1;
719 $self->{'_params'}->{'-seq_version'} = $1;
721 if($gi && (index($gi,"GI:") == 0)) {
722 $self->{'_params'}->{'-primary_id'} = substr($gi,3);
726 # EMBL DT lines
727 sub _embl_date {
728 my ($self, $data) = @_;
729 while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
730 my ($date, $version) = ($1, $2);
731 $date =~ tr{,}{}d; # remove comma if new version
732 if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
733 my $release = Bio::Annotation::SimpleValue->new(
734 -tagname => 'creation_release',
735 -value => $1
737 $self->annotation_collection->add_Annotation($release);
738 } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
739 my $release = Bio::Annotation::SimpleValue->new(
740 -tagname => 'update_release',
741 -value => $1
743 $self->annotation_collection->add_Annotation($release);
744 my $update = Bio::Annotation::SimpleValue->new(
745 -tagname => 'update_version',
746 -value => $2
748 $self->annotation_collection->add_Annotation($update);
750 push @{ $self->{'_params'}->{'-dates'} }, $date;
754 # UniProt/SwissProt DT lines
755 sub _swiss_date {
756 my ($self, $data) = @_;
757 # swissprot
758 my @dls = split m{\n}, $data->{DATA};
759 for my $dl (@dls) {
760 my ($date, $version) = split(' ', $dl, 2);
761 $date =~ tr{,}{}d; # remove comma if new version
762 if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
763 $version =~ m{sequence version (\d+)\.}) { #new
764 my $update = Bio::Annotation::SimpleValue->new(
765 -tagname => 'seq_update',
766 -value => $1
768 $self->annotation_collection->add_Annotation($update);
769 } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
770 $version =~ m{entry version (\d+)\.}) { #new
771 $self->{'_params'}->{'-version'} = $1;
772 $self->{'_params'}->{'-seq_version'} = $1;
774 push @{ $self->{'_params'}->{'-dates'} }, $date;
778 # GenBank KEYWORDS line
779 # EMBL KW line
780 # UniProt/SwissProt KW line
781 sub _generic_keywords {
782 my ($self, $data) = @_;
783 $data->{DATA} =~ s{\.$}{};
784 my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
785 $self->{'_params'}->{'-keywords'} = \@kw;
788 # GenBank DEFINITION line
789 # EMBL DE line
790 # UniProt/SwissProt DE line
791 sub _generic_description {
792 my ($self, $data) = @_;
793 $self->{'_params'}->{'-desc'} = $data->{DATA};
796 # GenBank ACCESSION line
797 # EMBL AC line
798 # UniProt/SwissProt AC line
799 sub _generic_accession {
800 my ($self, $data) = @_;
801 my @accs = split m{[\s;]+}, $data->{DATA};
802 $self->{'_params'}->{'-accession_number'} = shift @accs;
803 $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
806 ####################### SPECIES HANDLERS #######################
808 # uses Bio::Species
809 # GenBank SOURCE, ORGANISM lines
810 # EMBL O* lines
811 # UniProt/SwissProt O* lines
812 sub _generic_species {
813 my ($self, $data) = @_;
815 my $seqformat = $self->format;
816 # if data is coming in from GenBank parser...
817 if ($seqformat eq 'genbank' &&
818 $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819 ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
822 # SwissProt stuff...
823 # hybrid names in swissprot files are no longer valid per intergration into
824 # UniProt. Files containing these have been split into separate entries, so
825 # it is probably a good idea to update if one has these lingering around...
827 my $taxid;
828 if ($seqformat eq 'swiss') {
829 if ($data->{DATA} =~ m{^([^,]+)}ox) {
830 $data->{DATA} = $1;
832 if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833 $taxid = $1;
837 my ($sl, $class, $sci_name) = ($data->{DATA},
838 $data->{CLASSIFICATION},
839 $data->{ORGANISM} || '');
840 my ($organelle,$abbr_name, $common);
841 my @class = reverse split m{\s*;\s*}, $class;
842 # have to treat swiss different from everything else...
843 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format
844 \s*(.*?)
845 \s*(?: \( (.*?) \) )?\.?$
846 }xmso ){
847 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848 } else {
849 $abbr_name = $sl; # nothing caught; this is a backup!
851 # there is no 'abbreviated name' for EMBL
852 $sci_name = $abbr_name if $seqformat ne 'genbank';
853 $organelle ||= '';
854 $common ||= '';
855 $sci_name || return;
856 unshift @class, $sci_name;
857 # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858 my $make = Bio::Species->new();
859 $make->scientific_name($sci_name);
860 $make->classification(@class) if @class > 0;
861 $common && $make->common_name( $common );
862 $abbr_name && $make->name('abbreviated', $abbr_name);
863 $organelle && $make->organelle($organelle);
864 $taxid && $make->ncbi_taxid($taxid);
865 $self->{'_params'}->{'-species'} = $make;
868 ####################### ANNOTATION HANDLERS #######################
870 # GenBank DBSOURCE line
871 sub _genbank_dbsource {
872 my ($self, $data) = @_;
873 my $dbsource = $data->{DATA};
874 my $annotation = $self->annotation_collection;
875 # deal with swissprot dbsources
876 # we could possibly parcel these out to subhandlers...
877 if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878 $annotation->add_Annotation
879 ('dblink',
880 Bio::Annotation::DBLink->new
881 (-primary_id => $2,
882 -database => $1,
883 -tagname => 'dblink'));
884 if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885 $annotation->add_Annotation
886 ('swissprot_dates',
887 Bio::Annotation::SimpleValue->new
888 (-tagname => 'date_created',
889 -value => $1));
891 while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892 $annotation->add_Annotation
893 ('swissprot_dates',
894 Bio::Annotation::SimpleValue->new
895 (-tagname => 'date_updated',
896 -value => $1));
898 $dbsource =~ s/\n/ /g;
899 if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
900 # will use $i to determine even or odd
901 # for swissprot the accessions are paired
902 my $i = 0;
903 for my $dbsrc ( split(/,\s+/,$1) ) {
904 if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
905 my ($id,$version) = ($1,$2);
906 $version ='' unless defined $version;
907 my $db;
908 if( $id =~ /^\d\S{3}/) {
909 $db = 'PDB';
910 } else {
911 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
913 $annotation->add_Annotation
914 ('dblink',
915 Bio::Annotation::DBLink->new
916 (-primary_id => $id,
917 -version => $version,
918 -database => $db,
919 -tagname => 'dblink'));
922 } elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923 # download screwed up and ncbi didn't put acc in for gi numbers
924 my $i = 0;
925 for my $id ( split(/\,\s+/,$1) ) {
926 my ($acc,$db);
927 if( $id =~ /gi:\s+(\d+)/ ) {
928 $acc= $1;
929 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930 } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931 $acc= $1;
932 $db = 'PDB';
933 } else {
934 $acc= $id;
935 $db = '';
937 $annotation->add_Annotation
938 ('dblink',
939 Bio::Annotation::DBLink->new
940 (-primary_id => $acc,
941 -database => $db,
942 -tagname => 'dblink'));
944 } else {
945 $self->warn("Cannot match $dbsource\n");
947 if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
948 ((?:\S+,\s+)+\S+)//x ) {
949 for my $id ( split(/\,\s+/,$1) ) {
950 my $db;
951 # this is because GenBank dropped the spaces!!!
952 # I'm sure we're not going to get this right
953 ##if( $id =~ s/^://i ) {
954 ## $db = $1;
956 $db = substr($id,0,index($id,':'));
957 if (! exists $DBSOURCE{ $db }) {
958 $db = ''; # do we want 'GenBank' here?
960 $id = substr($id,index($id,':')+1);
961 $annotation->add_Annotation
962 ('dblink',Bio::Annotation::DBLink->new
963 (-primary_id => $id,
964 -database => $db,
965 -tagname => 'dblink'));
968 } else {
969 if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
970 my ($db,$id,$version) = ($1,$2,$3);
971 $annotation->add_Annotation
972 ('dblink',
973 Bio::Annotation::DBLink->new
974 (-primary_id => $id,
975 -version => $version,
976 -database => $db || 'GenBank',
977 -tagname => 'dblink'));
978 } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979 my ($id, $db, $version);
980 if ($2 eq ':') {
981 ($db, $id) = ($1, $3);
982 } else {
983 ($db, $id, $version) = ('GenBank', $1, $3);
985 $annotation->add_Annotation('dblink',
986 Bio::Annotation::DBLink->new(
987 -primary_id => $id,
988 -version => $version,
989 -database => $db,
990 -tagname => 'dblink')
992 } else {
993 $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
998 # EMBL DR lines
999 # UniProt/SwissProt DR lines
1000 sub _generic_dbsource {
1001 my ($self, $data) = @_;
1002 #$self->debug(Dumper($data));
1003 while ($data->{DATA} =~ m{([^\n]+)}og) {
1004 my $dblink = $1;
1005 $dblink =~ s{\.$}{};
1006 my $link;
1007 my @linkdata = split '; ',$dblink;
1008 if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1009 #if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1010 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011 $link = Bio::Annotation::DBLink->new(-database => $databse,
1012 -primary_id => $prim_id,
1013 -optional_id => $sec_id);
1014 } else {
1015 $self->warn("No match for $dblink");
1017 $self->annotation_collection->add_Annotation('dblink', $link);
1022 # GenBank REFERENCE and related lines
1023 # EMBL R* lines
1024 # UniProt/SwissProt R* lines
1025 sub _generic_reference {
1026 my ($self, $data) = @_;
1027 my $seqformat = $self->format;
1028 my ($start, $end);
1029 # get these in EMBL/Swiss
1030 if ($data->{CROSSREF}) {
1031 while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) {
1032 my ($db, $ref) = (uc $1, $2);
1033 $ref =~ s{[;.]+$}{};
1034 $data->{$db} = $ref;
1037 # run some cleanup for swissprot
1038 if ($seqformat eq 'swiss') {
1039 for my $val (values %{ $data }) {
1040 $val =~ s{;$}{};
1041 $val =~ s{(\w-)\s}{$1};
1044 if ( $data->{POSITION} ) {
1045 if ($seqformat eq 'embl') {
1046 ($start, $end) = split '-', $data->{POSITION},2;
1047 } elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048 ($start, $end) = ($1, $2);
1051 if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) {
1052 ($start, $end) = ($1, $2);
1054 my $ref = Bio::Annotation::Reference->new(
1055 -comment => $data->{REMARK},
1056 -location => $data->{JOURNAL},
1057 -pubmed => $data->{PUBMED},
1058 -consortium => $data->{CONSRTM},
1059 -title => $data->{TITLE},
1060 -authors => $data->{AUTHORS},
1061 -medline => $data->{MEDLINE},
1062 -doi => $data->{DOI},
1063 -rp => $data->{POSITION}, # JIC...
1064 -start => $start,
1065 -end => $end,
1067 if ($data->{DATA} =~ m{^\d+\s+\((.*)\)}xmso) {
1068 $ref->gb_reference($1);
1070 $self->annotation_collection->add_Annotation('reference', $ref);
1073 # GenBank COMMENT lines
1074 # EMBL CC lines
1075 # UniProt/SwissProt CC lines
1076 sub _generic_comment {
1077 my ($self, $data) = @_;
1078 $self->annotation_collection->add_Annotation('comment',
1079 Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1082 ####################### SEQFEATURE HANDLER #######################
1084 # GenBank Feature Table
1085 sub _generic_seqfeatures {
1086 my ($self, $data) = @_;
1087 return if $data->{FEATURE_KEY} eq 'FEATURES';
1088 my $primary_tag = $data->{FEATURE_KEY};
1090 # grab the NCBI taxon ID from the source SF
1091 if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092 if ( $self->{'_params'}->{'-species'} &&
1093 $data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094 $self->{'_params'}->{'-species'}->ncbi_taxid($1);
1097 my $source = $self->format;
1099 my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
1101 my $loc;
1102 eval {
1103 $loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1105 if(! $loc) {
1106 $self->warn("exception while parsing location line [" .
1107 $data->{'LOCATION'} .
1108 "] in reading $source, ignoring feature " .
1109 $data->{'primary_tag'}.
1110 " (seqid=" . $seqid . "): " . $@);
1111 return;
1113 if($seqid && (! $loc->is_remote())) {
1114 $loc->seq_id($seqid); # propagates if it is a split location
1116 my $sf = Bio::SeqFeature::Generic->direct_new();
1117 $sf->location($loc);
1118 $sf->primary_tag($primary_tag);
1119 $sf->seq_id($seqid);
1120 $sf->source_tag($source);
1121 delete $data->{'FEATURE_KEY'};
1122 delete $data->{'LOCATION'};
1123 delete $data->{'NAME'};
1124 delete $data->{'DATA'};
1125 $sf->set_attributes(-tag => $data);
1126 push @{ $self->{'_params'}->{'-features'} }, $sf;
1129 ####################### ODDS AND ENDS #######################
1131 # Those things that don't fit anywhere else. If a specific name
1132 # maps to the below table, that class and method are used, otherwise
1133 # it goes into a SimpleValue (I think this is a good argument for why
1134 # we need a generic mechanism for storing annotation)
1136 sub _generic_simplevalue {
1137 my ($self, $data) = @_;
1138 $self->annotation_collection->add_Annotation(
1139 Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140 -value => $data->{DATA})
1144 sub noop {}
1146 sub _debug {
1147 my ($self, $data) = @_;
1148 $self->debug(Dumper($data));