changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / SeqIO / gbxml.pm
blob60c0bafaace9aec63b78a25e2a5808638f7a6dd3
1 # $Id: gbxml.pm
2 #
3 # BioPerl module for Bio::SeqIO::gbxml
5 # Cared for by Ryan Golhar
6 # NOTE: This module is implemented on an as needed basis. As features
7 # are needed, they are implemented. Its very bare-bones.
9 # Based off http://www.insdc.org/page.php?page=documents&sid=105a8b52b69db9c36c82a2e0d923ca69
11 # I tried to follow the genbank module to keep things as consistent as possible
12 # Right now, I'm not respecting the want_slot parameters. This will need to be added.
14 =head1 NAME
16 Bio::SeqIO::gbxml - GenBank sequence input/output stream using SAX
18 =head1 SYNOPSIS
20 It is probably best not to use this object directly, but rather go
21 through the SeqIO handler system. To read a GenBank XML file:
23 $stream = Bio::SeqIO->new( -file => $filename, -format => 'gbxml');
25 while ( my $bioSeqObj = $stream->next_seq() ) {
26 # do something with $bioSeqObj
29 To write a Seq object to the current file handle in GenBank XML format:
31 $stream->write_seq( -seq => $seqObj);
33 If instead you would like a XML::DOM object containing the GBXML, use:
35 my $newXmlObject = $stream->to_bsml( -seq => $seqObj);
37 =head1 DEPENDENCIES
39 In addition to parts of the Bio:: hierarchy, this module uses:
41 XML::SAX
43 =head1 DESCRIPTION
45 This object can transform Bio::Seq objects to and from GenBank XML
46 flatfiles.
48 =head1 FEEDBACK
50 =head2 Mailing Lists
52 User feedback is an integral part of the evolution of this and other
53 Bioperl modules. Send your comments and suggestions preferably to one
54 of the Bioperl mailing lists. Your participation is much appreciated.
56 bioperl-l@bioperl.org - General discussion
57 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
59 =head2 Reporting Bugs
61 Report bugs to the Bioperl bug tracking system to help us keep track
62 the bugs and their resolution. Bug reports can be submitted via the
63 web:
65 https://github.com/bioperl/bioperl-live/issues
67 =head1 AUTHOR - Ryan Golhar
69 Email golharam-at-umdnj-dot-edu
71 =cut
73 package Bio::SeqIO::gbxml;
74 use vars qw($Default_Source);
75 use strict;
77 use Bio::SeqIO::FTHelper;
78 use Bio::SeqFeature::Generic;
79 use Bio::Species;
80 use XML::SAX;
81 use Bio::Seq::SeqFactory;
82 use Bio::Annotation::Collection;
83 use Bio::Annotation::Comment;
84 use Bio::Annotation::Reference;
85 use Bio::Annotation::DBLink;
87 use base qw(Bio::SeqIO XML::SAX::Base);
89 $Default_Source = 'GBXML';
91 sub _initialize {
92 my ($self) = shift;
93 $self->SUPER::_initialize(@_);
94 $self->{'_parser'} = XML::SAX::ParserFactory->parser('Handler' => $self);
95 if( ! defined $self->sequence_factory ) {
96 $self->sequence_factory(Bio::Seq::SeqFactory->new
97 (-verbose => $self->verbose(),
98 -type => 'Bio::Seq::RichSeq'));
100 return;
103 =head1 METHODS
105 =cut
107 =head2 next_seq
109 Title : next_seq
110 Usage : my $bioSeqObj = $stream->next_seq
111 Function: Retrieves the next sequence from a SeqIO::gbxml stream.
112 Returns : A reference to a Bio::Seq::RichSeq object
113 Args :
115 =cut
117 sub next_seq {
118 my $self = shift;
119 if( @{$self->{'_seendata'}->{'_seqs'} || []} || eof($self->_fh)) {
120 return shift @{$self->{'_seendata'}->{'_seqs'}};
122 $self->{'_parser'}->parse_file($self->_fh);
123 return shift @{$self->{'_seendata'}->{'_seqs'}};
126 # XML::SAX::Base methods
128 sub start_document {
129 my ($self,$doc) = @_;
130 $self->{'_seendata'} = {'_seqs' => [] #,
131 # '_authors' => [],
132 # '_feats' => []
134 $self->SUPER::start_document($doc);
137 sub end_document {
138 my ($self,$doc) = @_;
139 $self->SUPER::end_document($doc);
143 sub start_element {
144 my ($self,$ele) = @_;
145 my $name = uc($ele->{'LocalName'});
147 # my $attr = $ele->{'Attributes'};
148 # my $seqid = defined $self->{'_seendata'}->{'_seqs'}->[-1] ?
149 # $self->{'_seendata'}->{'_seqs'}->[-1]->display_id : undef;
151 # for my $k ( keys %$attr ) {
152 # $attr->{uc $k} = $attr->{$k};
153 # delete $attr->{$k};
154 # }
156 if( $name eq 'GBSET' ) {
158 } elsif( $name eq 'GBSEQ' ) {
159 # Initialize, we are starting a new sequence.
160 push @{$self->{'_seendata'}->{'_seqs'}},
161 $self->sequence_factory->create();
162 } elsif( $name eq 'GBFEATURE' ) {
163 my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1];
164 my $fthelper = Bio::SeqIO::FTHelper->new();
165 $fthelper->verbose($self->verbose());
166 push @{$self->{'_seendata'}->{'_feats'}}, $fthelper;
169 # } elsif( $name eq 'FEATURE-TABLES' ) {
170 # } elsif( $name eq 'database-xref' ) {
171 # my ($db,$id) = split(/:/,$content);
172 # $curseq->annotation->add_Annotation('dblink',
173 # Bio::Annotation::DBLink->new
174 # ( -database => $db,
175 # -primary_id=> $id));
176 # } elsif( $name eq 'INTERVAL-LOC' ) {
177 # my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1];
178 # my ($start,$end,$strand) =
179 # map { $attr->{'{}'.$_}->{'Value'} } qw(STARTPOS
180 # ENDPOS
181 # COMPLEMENT);
183 # $curfeat->start($start);
184 # $curfeat->end($end);
185 # $curfeat->strand(-1) if($strand);
186 # } elsif( $name eq 'REFERENCE' ) {
187 # push @{$self->{'_seendata'}->{'_annot'}},
188 # Bio::Annotation::Reference->new();
190 $self->{'_characters'} = '';
192 push @{$self->{'_state'}}, $name;
193 $self->SUPER::start_element($ele);
196 sub end_element {
197 my ($self,$ele) = @_;
198 pop @{$self->{'_state'}};
199 my $name = uc $ele->{'LocalName'};
200 my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1];
201 my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1];
203 if ($name eq 'GBSEQ_LOCUS') {
204 $curseq->display_id($self->{'_characters'});
206 } elsif ($name eq 'GBSEQ_LENGTH' ) {
207 $curseq->length($self->{'_characters'});
209 } elsif ($name eq 'GBSEQ_MOLTYPE' ) {
210 if ($self->{'_characters'} =~ /mRNA|dna/) {
211 $curseq->alphabet('dna');
212 } else {
213 $curseq->alphabet('protein');
215 $curseq->molecule($self->{'_characters'});
217 } elsif ($name eq 'GBSEQ_TOPOLOGY' ) {
218 $curseq->is_circular(($self->{'_characters'} =~ /^linear$/i) ? 0 : 1);
220 } elsif ($name eq 'GBSEQ_DIVISION' ) {
221 $curseq->division($self->{'_characters'});
223 } elsif ($name =~ m/GBSEQ_UPDATE-DATE|GBSEQ_CREATE-DATE/ ) {
224 my $date = $self->{'_characters'};
225 # This code was taken from genbank.pm
226 if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
227 if( length($date) < 11 ) { # improperly formatted date
228 # But we'll be nice and fix it for them
229 my ($d,$m,$y) = ($2,$3,$4);
230 $d = "0$d" if( length($d) == 1 );
231 # guess the century here
232 if( length($y) == 2 ) {
233 # arbitrarily guess that '60' means 1960
234 $y = ($y > 60) ? "19$y" : "20$y";
235 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
237 $date = [join('-',$d,$m,$y)];
239 $curseq->add_date($date);
242 } elsif ($name eq 'GBSEQ_DEFINITION' ) {
243 $curseq->description($self->{'_characters'});
245 } elsif ($name eq 'GBSEQ_PRIMARY-ACCESSION' ) {
246 $curseq->accession_number($self->{'_characters'});
248 } elsif ($name eq 'GBSEQ_ACCESSION-VERSION' ) {
249 # also taken from genbank.pm
250 $self->{'_characters'} =~ m/^\w+\.(\d+)/;
251 if ($1) {
252 $curseq->version($1);
253 $curseq->seq_version($1);
256 } elsif ($name eq 'GBSEQID' ) {
257 if ($self->{'_characters'} =~ m/gi\|(\d+)/) {
258 $curseq->primary_id($1);
261 } elsif ($name eq 'GBSEQ_SOURCE') {
262 $self->{'_taxa'}->{'_common'} = $self->{'_characters'};
264 } elsif ($name eq 'GBSEQ_ORGANISM' ) {
265 # taken from genbank.pm
266 my @organell_names = ("chloroplast", "mitochondr");
267 my @spflds = split(' ', $self->{'_characters'});
269 $_ = $self->{'_characters'};
270 if (grep { $_ =~ /^$spflds[0]/i; } @organell_names) {
271 $self->{'_taxa'}->{'_organelle'} = shift(@spflds);
273 $self->{'_taxa'}->{'_genus'} = shift(@spflds);
274 $self->{'_taxa'}->{'_species'} = shift(@spflds) if (@spflds);
275 $self->{'_taxa'}->{'_sub_species'} = shift(@spflds) if (@spflds);
276 $self->{'_taxa'}->{'_ns_name'} = $self->{'_characters'};
278 } elsif ($name eq 'GBSEQ_TAXONOMY' ) {
279 # taken from genbank.pm
280 $_ = $self->{'_characters'};
281 my @class;
282 push (@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $_);
284 next unless $self->{'_taxa'}->{'_genus'} and $self->{'_taxa'}->{'_genus'} !~ /^(unknown|None)$/oi;
285 if ($class[0] eq 'Viruses') {
286 push( @class, $self->{'_taxa'}->{'_ns_name'} );
288 elsif ($class[$#class] eq $self->{'_taxa'}->{'_genus'}) {
289 push( @class, $self->{'_taxa'}->{'_species'} );
290 } else {
291 push( @class, $self->{'_taxa'}->{'_genus'}, $self->{'_taxa'}->{'_species'} );
293 @class = reverse @class;
295 my $make = Bio::Species->new();
296 $make->classification( \@class, "FORCE");
297 $make->common_name($self->{'_taxa'}->{'_common'}) if $self->{'_taxa'}->{'_common'};
298 unless ($class[-1] eq 'Viruses') {
299 $make->sub_species( $self->{'_taxa'}->{'_sub_species'} ) if $self->{'_taxa'}->{'_sub_species'};
301 $make->organelle( $self->{'_taxa'}->{'_organelle'} ) if $self->{'_taxa'}->{'_organelle'};
302 $curseq->species($make);
303 delete $self->{'_taxa'};
305 } elsif( $name eq 'GBSEQ_COMMENT' ) {
306 $curseq->annotation->add_Annotation('comment', Bio::Annotation::Comment->new(-text => $self->{'_characters'} )) if ($self->{'_characters'});
308 } elsif ($name eq 'GBFEATURE_KEY' ) {
309 $curfeat->key($self->{'_characters'});
311 } elsif ($name eq 'GBFEATURE_LOCATION' ) {
312 $curfeat->loc($self->{'_characters'});
314 } elsif ($name eq 'GBQUALIFIER_NAME' ) {
315 $self->{'_feature'}->{"_qualifer_name"} = $self->{'_characters'};
317 } elsif ($name eq 'GBQUALIFIER_VALUE' ) {
318 my $qualifier = $self->{'_feature'}->{"_qualifer_name"};
319 delete $self->{'_feature'}->{"_qualifer_name"};
321 $curfeat->field->{$qualifier} ||= [];
322 push(@{$curfeat->field->{$qualifier}}, $self->{'_characters'});
324 } elsif ($name eq 'GBSEQ_SEQUENCE' ) {
325 $curseq->seq($self->{'_characters'});
327 } elsif( $name eq 'GBFEATURE' ) {
328 shift @{$self->{'_seendata'}->{'_feats'}};
329 # copied from genbank.pm
330 if (!defined($curfeat)) {
331 $self->warn("Unexpected error in feature table for ".$curseq->display_id." Skipping feature, attempting to recover");
332 } else {
333 my $feat = $curfeat->_generic_seqfeature($self->location_factory(), $curseq->display_id);
334 if ($curseq->species && ($feat->primary_tag eq 'source') &&
335 $feat->has_tag('db_xref') && (! $curseq->species->ncbi_taxid())) {
336 foreach my $tagval ($feat->get_tag_values('db_xref')) {
337 if (index($tagval,"taxon:") == 0) {
338 $curseq->species->ncbi_taxid(substr($tagval,6));
342 $curseq->add_SeqFeature($feat);
346 # if( $name eq 'REFERENCE') {
347 # my $ref = pop @{$self->{'_seendata'}->{'_annot'}};
348 # $curseq->annotation->add_Annotation('reference',$ref);
350 $self->SUPER::end_element($ele);
353 # Characters should be buffered because we may not always get the entire string. Once the entire string is read
354 # process it in end_element.
355 sub characters {
356 my ($self,$data) = @_;
357 if( ! @{$self->{'_state'}} ) {
358 $self->warn("Calling characters with no previous start_element call. Ignoring data");
359 } else {
360 # my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1];
361 # my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1];
362 # my $curannot = $self->{'_seendata'}->{'_annot'}->[-1];
363 # my $name = $self->{'_state'}->[-1];
365 # if ($name eq 'GBSEQ_LOCUS' ) {
366 $self->{'_characters'} .= $data->{'Data'};
368 # } elsif ($name eq 'GBSEQ_LENGTH' ) {
369 # $self->{'_characters'} .= $data->{'Data'};
371 # } elsif ($name eq 'GBSEQ_MOLTYPE' ) {
372 # $self->{'_characters'} .= $data->{'Data'};
374 # } elsif ($name eq 'GBSEQ_TOPOLOGY' ) {
375 # $self->{'_characters'} .= $data->{'Data'};
377 # } elsif ($name eq 'GBSEQ_DIVISION' ) {
378 # $self->{'_characters'} .= $data->{'Data'};
380 # } elsif ($name =~ m/GBSEQ_UPDATE-DATE|GBSEQ_CREATE-DATE/ ) {
381 # $self->{'_characters'} .= $data->{'Data'};
383 # } elsif ($name eq 'GBSEQ_DEFINITION' ) {
384 # $self->{'_characters'} .= $data->{'Data'};
386 # } elsif ($name eq 'GBSEQ_PRIMARY-ACCESSION' ) {
387 # $self->{'_characters'} .= $data->{'Data'};
389 # } elsif ($name eq 'GBSEQ_ACCESSION-VERSION' ) {
390 # $self->{'_characters'} .= $data->{'Data'};
392 # } elsif ($name eq 'GBSEQID' ) {
393 # $self->{'_characters'} .= $data->{'Data'};
395 # } elsif ($name eq 'GBSEQ_SOURCE') {
396 # $self->{'_characters'} .= $data->{'Data'};
398 # } elsif ($name eq 'GBSEQ_ORGANISM' ) {
399 # $self->{'_characters'} .= $data->{'Data'};
401 # } elsif ($name eq 'GBSEQ_TAXONOMY' ) {
402 # $self->{'_characters'} .= $data->{'Data'};
404 # } elsif ($name eq 'GBSEQ_COMMENT' ) {
405 # $self->{'_characters'} .= $data->{'Data'};
407 # } elsif ($name eq 'GBFEATURE_KEY' ) {
408 # $self->{'_characters'} .= $data->{'Data'};
410 # } elsif ($name eq 'GBFEATURE_LOCATION' ) {
411 # $self->{'_characters'} .= $data->{'Data'};
413 # } elsif ($name eq 'GBQUALIFIER_NAME' ) {
414 # $self->{'_characters'} .= $data->{'Data'};
416 # } elsif ($name eq 'GBQUALIFIER_VALUE' ) {
417 # $self->{'_characters'} .= $data->{'Data'};
419 # } elsif ($name eq 'GBINTERVAL_FROM' ) {
420 # $self->{'_feature'}->{'_interval_from'} = $data->{'Data'};
422 # } elsif ($name eq 'GBINTERVAL_TO' ) {
423 # $self->{'_feature'}->{'_interval_to'} = $data->{'Data'};
425 # } elsif ($name eq 'GBINTERVAL_ACCESSION' ) {
426 # $self->{'_feature'}->{'_interval_accession'} = $data->{'Data'};
428 # } elsif ($name eq 'GBSEQ_SEQUENCE' ) {
429 # $self->{'_characters'} .= $data->{'Data'};
432 $self->SUPER::characters($data);