[bug 2714]
[bioperl-live.git] / Bio / SeqIO / embldriver.pm
blobe47ad21ca8d3e098e8dd90e2bb17f06a4e6fa163
1 # $Id$
3 # BioPerl module for Bio::SeqIO::embldriver
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqIO::embldriver - EMBL sequence input/output stream
17 =head1 SYNOPSIS
19 It is probably best not to use this object directly, but
20 rather go through the SeqIO handler system. Go:
22 $stream = Bio::SeqIO->new(-file => $filename, -format => 'embldriver');
24 while ( (my $seq = $stream->next_seq()) ) {
25 # do something with $seq
28 =head1 DESCRIPTION
30 This object can transform Bio::Seq objects to and from EMBL flat
31 file databases.
33 There is a lot of flexibility here about how to dump things which
34 should be documented more fully.
36 There should be a common object that this and Genbank share (probably
37 with Swissprot). Too much of the magic is identical.
39 =head2 Optional functions
41 =over 3
43 =item _show_dna()
45 (output only) shows the dna or not
47 =item _post_sort()
49 (output only) provides a sorting func which is applied to the FTHelpers
50 before printing
52 =item _id_generation_func()
54 This is function which is called as
56 print "ID ", $func($annseq), "\n";
58 To generate the ID line. If it is not there, it generates a sensible ID
59 line using a number of tools.
61 If you want to output annotations in EMBL format they need to be
62 stored in a Bio::Annotation::Collection object which is accessible
63 through the Bio::SeqI interface method L<annotation()|annotation>.
65 The following are the names of the keys which are polled from a
66 L<Bio::Annotation::Collection> object.
68 reference - Should contain Bio::Annotation::Reference objects
69 comment - Should contain Bio::Annotation::Comment objects
70 dblink - Should contain Bio::Annotation::DBLink objects
72 =back
74 =head1 FEEDBACK
76 =head2 Mailing Lists
78 User feedback is an integral part of the evolution of this and other
79 Bioperl modules. Send your comments and suggestions preferably to one
80 of the Bioperl mailing lists. Your participation is much appreciated.
82 bioperl-l@bioperl.org - General discussion
83 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85 =head2 Reporting Bugs
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 the bugs and their resolution. Bug reports can be submitted via
89 the web:
91 http://bugzilla.open-bio.org/
93 =head1 AUTHOR - Ewan Birney
95 Email birney@ebi.ac.uk
97 =head1 APPENDIX
99 The rest of the documentation details each of the object
100 methods. Internal methods are usually preceded with a _
102 =cut
104 # Let the code begin...
106 package Bio::SeqIO::embldriver;
107 use vars qw(%FTQUAL_NO_QUOTE);
108 use strict;
109 use Bio::SeqIO::Handler::GenericRichSeqHandler;
110 use Data::Dumper;
112 use base qw(Bio::SeqIO);
114 my %FTQUAL_NO_QUOTE = map {$_ => 1} qw(
115 anticodon citation
116 codon codon_start
117 cons_splice direction
118 evidence label
119 mod_base number
120 rpt_type rpt_unit
121 transl_except transl_table
122 usedin
123 LOCATION
126 my %DATA_KEY = (
127 ID => 'ID',
128 AC => 'ACCESSION',
129 DT => 'DATE',
130 DE => 'DESCRIPTION',
131 KW => 'KEYWORDS',
132 OS => 'SOURCE',
133 OC => 'CLASSIFICATION',
134 OG => 'ORGANELLE',
135 RN => 'REFERENCE',
136 RA => 'AUTHORS',
137 RC => 'COMMENT',
138 RG => 'CONSRTM',
139 RP => 'POSITION',
140 RX => 'CROSSREF',
141 RT => 'TITLE',
142 RL => 'LOCATION',
143 XX => 'SPACER',
144 FH => 'FEATHEADER',
145 FT => 'FEATURES',
146 AH => 'TPA_HEADER', # Third party annotation
147 AS => 'TPA_DATA', # Third party annotation
148 DR => 'DBLINK',
149 CC => 'COMMENT',
150 CO => 'CO',
151 CON => 'CON',
152 WGS => 'WGS',
153 ANN => 'ANN',
154 TPA => 'TPA',
155 SQ => 'SEQUENCE',
158 my %SEC = (
159 OC => 'CLASSIFICATION',
160 OH => 'HOST', # not currently handled, bundled with organism data for now
161 OG => 'ORGANELLE',
162 OX => 'CROSSREF',
163 RA => 'AUTHORS',
164 RC => 'COMMENT',
165 RG => 'CONSRTM',
166 RP => 'POSITION',
167 RX => 'CROSSREF',
168 RT => 'TITLE',
169 RL => 'JOURNAL',
170 AS => 'ASSEMBLYINFO', # Third party annotation
173 my %DELIM = (
174 #CC => "\n",
175 #DR => "\n",
176 #DT => "\n",
179 # signals to process what's in the hash prior to next round
180 # these should be changed to map secondary data
181 my %PRIMARY = map {$_ => 1} qw(ID AC DT DE SV KW OS RN AH DR FH CC SQ FT WGS CON ANN TPA //);
183 sub _initialize {
184 my($self,@args) = @_;
186 $self->SUPER::_initialize(@args);
187 my $handler = $self->_rearrange([qw(HANDLER)],@args);
188 # hash for functions for decoding keys.
189 $handler ? $self->seqhandler($handler) :
190 $self->seqhandler(Bio::SeqIO::Handler::GenericRichSeqHandler->new(
191 -format => 'embl',
192 -verbose => $self->verbose,
193 -builder => $self->sequence_builder
196 if( ! defined $self->sequence_factory ) {
197 $self->sequence_factory(Bio::Seq::SeqFactory->new
198 (-verbose => $self->verbose(),
199 -type => 'Bio::Seq::RichSeq'));
203 =head2 next_seq
205 Title : next_seq
206 Usage : $seq = $stream->next_seq()
207 Function: returns the next sequence in the stream
208 Returns : Bio::Seq object
209 Args :
211 =cut
213 sub next_seq {
214 my $self = shift;
215 my $hobj = $self->seqhandler;
216 local($/) = "\n";
217 my ($featkey, $qual, $annkey, $delim, $seqdata);
218 my $lastann = '';
219 my $ct = 0;
220 PARSER:
221 while(defined(my $line = $self->_readline)) {
222 next PARSER if $line =~ m{^\s*$};
223 chomp $line;
224 my ($ann,$data) = split m{\s{2,3}}, $line , 2;
225 next PARSER if ($ann eq 'XX' || $ann eq 'FH');
226 if ($ann) {
227 $data ||='';
228 if ($ann eq 'FT') {
229 # seqfeatures
230 if ($data =~ m{^(\S+)\s+([^\n]+)}) {
231 $hobj->data_handler($seqdata) if $seqdata;
232 $seqdata = ();
233 ($seqdata->{FEATURE_KEY}, $data) = ($1, $2);
234 $seqdata->{NAME} = $ann;
235 $qual = 'LOCATION';
236 } elsif ($data =~ m{^\s+/([^=]+)=?(.+)?}) {
237 ($qual, $data) = ($1, $2 ||'');
238 $ct = (exists $seqdata->{$qual}) ?
239 ((ref($seqdata->{$qual})) ? scalar(@{ $seqdata->{$qual} }) : 1)
240 : 0 ;
242 $data =~ s{^\s+}{};
243 $data =~ tr{"}{}d; # we don't care about quotes yet...
244 my $delim = ($FTQUAL_NO_QUOTE{$qual}) ? '' : ' ';
245 if ($ct == 0) {
246 $seqdata->{$qual} .= ($seqdata->{$qual}) ?
247 $delim.$data :
248 $data;
249 } else {
250 if (!ref($seqdata->{$qual})) {
251 $seqdata->{$qual} = [$seqdata->{$qual}];
253 (exists $seqdata->{$qual}->[$ct]) ?
254 (($seqdata->{$qual}->[$ct]) .= $delim.$data) :
255 (($seqdata->{$qual}->[$ct]) .= $data);
257 } else {
258 # simple annotations
259 $data =~ s{;$}{};
260 last PARSER if $ann eq '//';
261 if ($ann ne $lastann) {
262 if (!$SEC{$ann} && $seqdata) {
263 $hobj->data_handler($seqdata);
264 # can't use undef here; it can lead to subtle mem leaks
265 $seqdata = ();
267 $annkey = (!$SEC{$ann}) ? 'DATA' : # primary data
268 $SEC{$ann};
269 $seqdata->{'NAME'} = $ann if !$SEC{$ann};
272 # toss the data for SQ lines; this needs to be done after the
273 # call to the data handler
275 next PARSER if $ann eq 'SQ';
276 my $delim = $DELIM{$ann} || ' ';
277 $seqdata->{$annkey} .= ($seqdata->{$annkey}) ?
278 $delim.$data : $data;
279 $lastann = $ann;
281 } else {
282 # this should only be sequence (fingers crossed!)
283 SEQUENCE:
284 while (defined ($line = $self->_readline)) {
285 if (index($line, '//') == 0) {
286 $data =~ tr{0-9 \n}{}d;
287 $seqdata->{DATA} = $data;
288 #$self->debug(Dumper($seqdata));
289 $hobj->data_handler($seqdata);
290 $seqdata = ();
291 last PARSER;
292 } else {
293 $data .= $line;
294 $line = undef;
299 $hobj->data_handler($seqdata) if $seqdata;
300 $seqdata = ();
301 return $hobj->build_sequence;
304 sub next_chunk {
305 my $self = shift;
306 my $ct = 0;
307 PARSER:
308 while(defined(my $line = $self->_readline)) {
309 next if $line =~ m{^\s*$};
310 chomp $line;
311 my ($ann,$data) = split m{\s{2,3}}, $line , 2;
312 $data ||= '';
313 $self->debug("Ann: [$ann]\n\tData: [$data]\n");
314 last PARSER if $ann =~ m{//};
318 =head2 write_seq
320 Title : write_seq
321 Usage : $stream->write_seq($seq)
322 Function: writes the $seq object (must be seq) to the stream
323 Returns : 1 for success and 0 for error
324 Args : array of 1 to n Bio::SeqI objects
326 =cut
328 sub write_seq {
329 shift->throw("Use Bio::SeqIO::embl for output");
330 # maybe make a Writer class as well????
333 =head2 seqhandler
335 Title : seqhandler
336 Usage : $stream->seqhandler($handler)
337 Function: Get/Set teh Bio::Seq::HandlerBaseI object
338 Returns : Bio::Seq::HandlerBaseI
339 Args : Bio::Seq::HandlerBaseI
341 =cut
343 sub seqhandler {
344 my ($self, $handler) = @_;
345 if ($handler) {
346 $self->throw("Not a Bio::HandlerBaseI") unless
347 ref($handler) && $handler->isa("Bio::HandlerBaseI");
348 $self->{'_seqhandler'} = $handler;
350 return $self->{'_seqhandler'};
355 __END__