Merge pull request #181 from bioperl/limit-dockerhub-trigger
[bioperl-live.git] / Bio / SeqIO / tinyseq.pm
blob2b9d7f5853c6d8508391d714593f15190692feda
1 # BioPerl module for Bio::SeqIO::tinyseq
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Donald Jackson, donald.jackson@bms.com
7 # Copyright Bristol-Myers Squibb
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqIO::tinyseq - reading/writing sequences in NCBI TinySeq format
17 =head1 SYNOPSIS
19 Do not use this module directly; use the SeqIO handler system:
21 $stream = Bio::SeqIO->new( -file => $filename, -format => 'tinyseq' );
23 while ( my $seq = $stream->next_seq ) {
24 ....
27 =head1 DESCRIPTION
29 This object reads and writes Bio::Seq objects to and from TinySeq XML
30 format. A TinySeq is a lightweight XML file of sequence information,
31 analgous to FASTA format.
33 See L<http://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.mod.dtd> for the DTD.
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
61 the web:
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 SEE ALSO
67 L<Bio::SeqIO>, L<Bio::Seq>.
69 =head1 AUTHOR
71 Donald Jackson, E<lt>donald.jackson@bms.comE<gt>
73 Parts of this module and the test script were patterned after Sheldon
74 McKay's L<Bio::SeqIO::game>. If it breaks, however, it's my fault not
75 his ;).
77 =head1 APPENDIX
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
82 =cut
84 package Bio::SeqIO::tinyseq;
86 use strict;
87 use Bio::Seq::SeqFastaSpeedFactory;
88 use Bio::Species;
89 use Bio::SeqIO::tinyseq::tinyseqHandler;
90 use XML::Parser::PerlSAX;
91 use XML::Writer;
92 use Bio::Root::Version;
94 use base qw(Bio::SeqIO);
96 sub _initialize {
97 my ($self, @args) = @_;
99 $self->SUPER::_initialize(@args);
101 unless (defined $self->sequence_factory) {
102 $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new);
105 $self->{'_species_objects'} = {};
106 $self->{_parsed} = 0;
109 =head2 next_seq
111 Title : next_seq
112 Usage : $seq = $stream->next_seq()
113 Function: returns the next sequence in the stream
114 Returns : Bio::Seq object
115 Args : NONE
117 =cut
119 sub next_seq {
120 my ($self) = @_;
122 $self->_get_seqs() unless ($self->{_parsed});
124 return shift @{$self->{_seqlist}};
127 =head2 write_seq
129 Title : write_seq
130 Usage : $seq = $stream->write_seq(@sequence_objects); undef $stream
131 Function: outputs one or more sequence objects as TinySeq XML
132 Returns : 1 on success
133 Args : one or more sequence objects as TinySeq XML
135 Because the TSeq dtd includes closing tags after all sets are written,
136 the output will not be complete until the program terminates or the
137 object is forced out of scope (see close_writer()). May not perfectly
138 reproduce TSeq_sid element for all sequences
140 =cut
142 sub write_seq {
143 my ($self, @seqobjs) = @_;
145 $self->throw('write_seq must be called with at least one Bio::SeqI or Bio::PrimarySeqI compliant object')
146 unless (@seqobjs and ( $seqobjs[0]->isa('Bio::SeqI') || $seqobjs[0]->isa('Bio::PrimarySeqI')));
148 my $writer = $self->_get_writer;
150 foreach my $seqobj (@seqobjs) {
151 my ($id_element, $id_value) = $self->_get_idstring($seqobj);
152 $writer->startTag('TSeq');
153 $writer->emptyTag('TSeq_seqtype', value => $self->_convert_seqtype($seqobj));
155 $writer->dataElement('TSeq_gi', $seqobj->primary_id || '');
156 $writer->dataElement($id_element, $id_value);
157 #$writer->dataElement('TSeq_orgname', $seqobj->taxid) if ($seqobj->can('taxid')); # just a placeholder
158 $writer->dataElement('TSeq_defline', $seqobj->desc);
159 $writer->dataElement('TSeq_length', $seqobj->length);
160 $writer->dataElement('TSeq_sequence', $seqobj->seq);
162 if ($seqobj->can('species') && $seqobj->species) {
163 $self->_write_species($writer, $seqobj->species);
166 $writer->endTag('TSeq');
171 =head2 _get_seqs
173 Title : _get_seqs
174 Usage : Internal function - use next_seq() instead
175 Function: parses the XML and creates Bio::Seq objects
176 Returns : 1 on success
177 Args : NONE
179 Currently stores all sequence objects into memory. I will work on do
180 more of a stream-based approach
182 =cut
184 sub _get_seqs {
185 my ($self) = @_;
186 my $fh = $self->_fh;
188 my $handler = Bio::SeqIO::tinyseq::tinyseqHandler->new();
189 my $parser = XML::Parser::PerlSAX->new( Handler => $handler );
190 my @seqatts = $parser->parse( Source => { ByteStream => $fh });
191 my $factory = $self->sequence_factory;
193 $self->{_seqlist} ||= [];
194 foreach my $seqatt(@seqatts) {
195 foreach my $subatt(@$seqatt) { # why are there two hashes?
196 my $seqobj = $factory->create(%$subatt);
197 $self->_assign_identifier($seqobj, $subatt);
199 if ($seqobj->can('species')) {
200 # my $class = [reverse(split(/ /, $subatt->{'-organism'}))];
201 # my $species = Bio::Species->new( -classification => $class,
202 # -ncbi_taxid => $subatt->{'-taxid'} );
203 my $species = $self->_get_species($subatt->{'-organism'}, $subatt->{'-taxid'});
204 $seqobj->species($species) if ($species);
207 push(@{$self->{_seqlist}}, $seqobj);
210 $self->{_parsed} = 1;
213 =head2 _get_species
215 Title : _get_species
216 Usage : Internal function
217 Function: gets a Bio::Species object from cache or creates as needed
218 Returns : a Bio::Species object on success, undef on failure
219 Args : a classification string (eg 'Homo sapiens') and
220 a NCBI taxon id (optional)
222 Objects are cached for parsing multiple sequence files.
224 =cut
226 sub _get_species {
227 my ($self, $orgname, $taxid) = @_;
229 unless ($self->{'_species_objects'}->{$orgname}) {
230 my $species = $self->_create_species($orgname, $taxid);
231 $self->{'_species_objects'}->{$orgname} = $species;
233 return $self->{'_species_objects'}->{$orgname};
236 =head2 _create_species
238 Title : _create_species
239 Usage : Internal function
240 Function: creates a Bio::Species object
241 Returns : a Bio::Species object on success, undef on failure
242 Args : a classification string (eg 'Homo sapiens') and
243 a NCBI taxon id (optional)
245 =cut
247 sub _create_species {
248 my ($self, $orgname, $taxid) = @_;
249 return unless ($orgname); # not required in TinySeq dtd so don't throw an error
251 my %params;
252 $params{'-classification'} = [reverse(split(/ /, $orgname))];
253 $params{'-ncbi_taxid'} = $taxid if ($taxid);
255 my $species = Bio::Species->new(%params)
256 or return;
258 return $species;
262 =head2 _assign_identifier
264 Title : _assign_identifier
265 Usage : Internal function
266 Function: looks for sequence accession
267 Returns : 1 on success
268 Args : NONE
270 NCBI puts refseq accessions in TSeq_sid, others in TSeq_accver.
272 =cut
274 sub _assign_identifier {
275 my ($self, $seqobj, $atts) = @_;
276 my ($accession, $version);
278 if ($atts->{'-accver'}) {
279 ($accession, $version) = split(/\./, $atts->{'-accver'});;
281 elsif ($atts->{'-sid'}) {
282 my $sidstring =$atts->{'-sid'};
283 $sidstring =~ s/^.+?\|//;
284 $sidstring =~ s/\|[^\|]*//;
285 ($accession, $version) = split(/\./, $sidstring);;
287 else {
288 $self->throw('NO accession information found for this sequence');
290 $seqobj->accession_number($accession) if ($seqobj->can('accession_number'));
291 $seqobj->version($version) if ($seqobj->can('version'));
294 =head2 _convert_seqtype
296 Title : _convert_seqtype
297 Usage : Internal function
298 Function: maps Bio::Seq::alphabet() values [dna/rna/protein] onto
299 TSeq_seqtype values [protein/nucleotide]
301 =cut
303 sub _convert_seqtype {
304 my ($self, $seqobj) = @_;
306 return 'protein' if ($seqobj->alphabet eq 'protein');
307 return 'nucleotide' if ($seqobj->alphabet eq 'dna');
308 return 'nucleotide' if ($seqobj->alphabet eq 'rna');
310 # if we get here there's a problem!
311 $self->throw("Alphabet not defined, can't assign type for $seqobj");
314 =head2 _get_idstring
316 Title : _get_idstring
317 Usage : Internal function
318 Function: parse accession and version info from TSeq_accver
319 or TSeq_sid
321 =cut
323 sub _get_idstring {
324 # NCBI puts refseq ids in TSeq_sid, others in TSeq_accver. No idea why.
325 my ($self, $seqobj) = @_;
326 my $accver = $seqobj->accession_number;
327 $accver .= '.' . $seqobj->version if ($seqobj->can('version') and $seqobj->version);
328 if ($accver =~ /^(NM_|NP_|XM_|XP_|NT_|NC_|NG_)/) {
329 return ('TSeq_sid', join('|', 'ref', $accver, ''));
331 else {
332 return ('TSeq_accver', $accver);
336 =head2 _get_writer
338 Title : _get_writer
339 Usage : Internal function
340 Function: instantiate XML::Writer object if needed,
341 output initial XML
343 =cut
345 sub _get_writer {
346 # initialize writer, start doc so write_seq can work one at a time
347 my ($self) = @_;
349 unless ($self->{_writer}) {
350 my $fh = $self->_fh;
351 my $writer = XML::Writer->new(OUTPUT => $fh,
352 DATA_MODE => 1,
353 DATA_INDENT => 2,
354 NEWLINE => 1,
356 $writer->doctype('TSeqSet', '-//NCBI//NCBI TSeq/EN', 'http://www.ncbi.nlm.nih.gov/dtd/NCBI_TSeq.dtd');
357 $writer->comment("Generated by Bio::SeqIO::tinyseq VERSION " . $Bio::Root::Version::VERSION);
358 $writer->startTag('TSeqSet');
360 $self->{_writer} = $writer;
362 return $self->{_writer};
365 =head2 close_writer
367 Title : close_writer
368 Usage : $self->close_writer()
369 Function: terminate XML output
370 Args : NONE
371 Returns : 1 on success
373 Called automatically by DESTROY when object goes out of scope
375 =cut
377 sub close_writer {
378 # close out any dangling writer
379 my ($self) = @_;
380 if ($self->{_writer}) {
381 my $writer = $self->{_writer};
382 $writer->endTag('TSeqSet');
383 $writer->end;
384 undef $writer;
386 close($self->_fh) if ($self->_fh);
390 sub _write_species {
391 my ($self, $writer, $species) = @_;
392 $writer->dataElement('TSeq_orgname', $species->binomial);
393 $writer->dataElement('TSeq_taxid', $species->ncbi_taxid)
394 if($species->ncbi_taxid);
397 sub DESTROY {
398 # primarily to close out a writer!
399 my ($self) = @_;
400 $self->close_writer;
401 undef $self;
405 __END__