sync w/ main trunk
[bioperl-live.git] / Bio / SeqIO / kegg.pm
bloba8ce6443cb1206949e03248e9ba289b1713968ae
1 # $Id$
3 # BioPerl module for Bio::SeqIO::kegg
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Allen Day <allenday@ucla.edu>
9 # Copyright Allen Day
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SeqIO::kegg - KEGG sequence input/output stream
19 =head1 SYNOPSIS
21 # It is probably best not to use this object directly, but
22 # rather go through the SeqIO handler system. Go:
24 use Bio::SeqIO;
26 $stream = Bio::SeqIO->new(-file => $filename, -format => 'KEGG');
28 while ( my $seq = $stream->next_seq() ) {
29 # do something with $seq
32 =head1 DESCRIPTION
34 This class transforms KEGG gene records into Bio::Seq objects.
36 =head2 Mapping of record properties to object properties
38 This section is supposed to document which sections and properties of
39 a KEGG databank record end up where in the Bioperl object model. It
40 is far from complete and presently focuses only on those mappings
41 which may be non-obvious. $seq in the text refers to the
42 Bio::Seq::RichSeqI implementing object returned by the parser for each
43 record.
45 =over 4
47 =item 'ENTRY'
49 $seq->primary_id
51 =item 'NAME'
53 $seq->display_id
55 =item 'DEFINITION'
57 $seq->annotation->get_Annotations('description');
59 =item 'ORTHOLOG'
61 grep {$_->database eq 'KO'} $seq->annotation->get_Annotations('dblink')
63 =item 'CLASS'
65 grep {$_->database eq 'PATH'}
66 $seq->annotation->get_Annotations('dblink')
68 =item 'POSITION'
70 FIXME, NOT IMPLEMENTED
72 =item 'PATHWAY'
74 for my $pathway ( $seq->annotation->get_Annotations('pathway') ) {
78 =item 'DBLINKS'
80 $seq->annotation->get_Annotations('dblink')
82 =item 'CODON_USAGE'
84 FIXME, NOT IMPLEMENTED
86 =item 'AASEQ'
88 $seq->translate->seq
90 =item 'NTSEQ'
92 $seq-E<gt>seq
94 =back
96 =head1 FEEDBACK
98 =head2 Mailing Lists
100 User feedback is an integral part of the evolution of this and other
101 Bioperl modules. Send your comments and suggestions preferably to one
102 of the Bioperl mailing lists. Your participation is much appreciated.
104 bioperl-l@bioperl.org - General discussion
105 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
107 =head2 Support
109 Please direct usage questions or support issues to the mailing list:
111 L<bioperl-l@bioperl.org>
113 rather than to the module maintainer directly. Many experienced and
114 reponsive experts will be able look at the problem and quickly
115 address it. Please include a thorough description of the problem
116 with code and data examples if at all possible.
118 =head2 Reporting Bugs
120 Report bugs to the Bioperl bug tracking system to help us keep track
121 the bugs and their resolution. Bug reports can be submitted via the web:
123 http://bugzilla.open-bio.org/
125 =head1 AUTHOR - Allen Day
127 Email allenday@ucla.edu
129 =head1 APPENDIX
131 The rest of the documentation details each of the object
132 methods. Internal methods are usually preceded with a _
134 =cut
136 # Let the code begin...
138 package Bio::SeqIO::kegg;
139 use strict;
141 use Bio::SeqFeature::Generic;
142 use Bio::Species;
143 use Bio::Seq::SeqFactory;
144 use Bio::Annotation::Collection;
145 use Bio::Annotation::Comment;
146 use Bio::Annotation::DBLink;
148 use base qw(Bio::SeqIO);
150 sub _initialize {
151 my($self,@args) = @_;
153 $self->SUPER::_initialize(@args);
154 # hash for functions for decoding keys.
155 $self->{'_func_ftunit_hash'} = {};
156 if( ! defined $self->sequence_factory ) {
157 $self->sequence_factory(Bio::Seq::SeqFactory->new
158 (-verbose => $self->verbose(),
159 -type => 'Bio::Seq::RichSeq'));
163 =head2 next_seq
165 Title : next_seq
166 Usage : $seq = $stream->next_seq()
167 Function: returns the next sequence in the stream
168 Returns : Bio::Seq::RichSeq object
169 Args :
171 =cut
173 sub next_seq {
174 my ($self,@args) = @_;
175 my $builder = $self->sequence_builder();
176 my $seq;
177 my %params;
179 my $buffer;
180 my (@acc, @features);
181 my ($display_id, $annotation);
182 my $species;
184 # initialize; we may come here because of starting over
185 @features = ();
186 $annotation = undef;
187 @acc = ();
188 $species = undef;
189 %params = (-verbose => $self->verbose); # reset hash
190 local($/) = "///\n";
192 $buffer = $self->_readline();
194 return if( !defined $buffer ); # end of file
195 $buffer =~ /^ENTRY/ ||
196 $self->throw("KEGG stream with bad ENTRY line. Not KEGG in my book. Got $buffer'");
198 my %FIELDS;
199 my @chunks = split /\n(?=\S)/, $buffer;
201 foreach my $chunk (@chunks){
202 my($key) = $chunk =~ /^(\S+)/;
203 $FIELDS{$key} = $chunk;
206 # changing to split method to get entry_ids that include
207 # sequence version like Whatever.1
208 my(undef,$entry_id,$entry_seqtype,$entry_species) =
209 split(' ',$FIELDS{ENTRY});
211 my($name);
212 if ($FIELDS{NAME}) {
213 ($name) = $FIELDS{NAME} =~ /^NAME\s+(.+)$/;
216 my( $definition, $aa_length, $aa_seq, $nt_length, $nt_seq );
218 if(( exists $FIELDS{DEFINITION} ) and ( $FIELDS{DEFINITION} =~ /^DEFINITION/ )) {
219 ($definition) = $FIELDS{DEFINITION} =~ /^DEFINITION\s+(.+)$/s;
220 $definition =~ s/\s+/ /gs;
222 if(( exists $FIELDS{AASEQ} ) and ( $FIELDS{AASEQ} =~ /^AASEQ/ )) {
223 ($aa_length,$aa_seq) = $FIELDS{AASEQ} =~ /^AASEQ\s+(\d+)\n(.+)$/s;
224 $aa_seq =~ s/\s+//g;
226 if(( exists $FIELDS{NTSEQ} ) and ( $FIELDS{NTSEQ} =~ /^NTSEQ/ )) {
227 ($nt_length,$nt_seq) = $FIELDS{NTSEQ} =~ /^NTSEQ\s+(\d+)\n(.+)$/s;
228 $nt_seq =~ s/\s+//g;
231 $annotation = Bio::Annotation::Collection->new();
233 $annotation->add_Annotation('description',
234 Bio::Annotation::Comment->new(-text => $definition));
236 $annotation->add_Annotation('aa_seq',
237 Bio::Annotation::Comment->new(-text => $aa_seq));
239 my($ortholog_db,$ortholog_id,$ortholog_desc);
240 if ($FIELDS{ORTHOLOG}) {
241 ($ortholog_db,$ortholog_id,$ortholog_desc) = $FIELDS{ORTHOLOG}
242 =~ /^ORTHOLOG\s+(\S+):\s+(\S+)\s+(.*?)$/;
244 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
245 -database => $ortholog_db,
246 -primary_id => $ortholog_id,
247 -comment => $ortholog_desc) );
250 if($FIELDS{MOTIF}){
251 $FIELDS{MOTIF} =~ s/^MOTIF\s+//;
252 while($FIELDS{MOTIF} =~/\s*?(\S+):\s+(.+?)$/mg){
253 my $db = $1;
254 my $ids = $2;
255 foreach my $id (split(/\s+/, $ids)){
257 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
258 -database =>$db,
259 -primary_id => $id,
260 -comment => "") );
265 if($FIELDS{PATHWAY}) {
266 $FIELDS{PATHWAY} =~ s/^PATHWAY\s+//;
267 while($FIELDS{PATHWAY} =~ /\s*PATH:\s+(.+)$/mg){
268 $annotation->add_Annotation('pathway',
269 Bio::Annotation::Comment->new(-text => "$1"));
273 if($FIELDS{POSITION}) {
274 $FIELDS{POSITION} =~ s/^POSITION\s+//;
275 $annotation->add_Annotation('position',
276 Bio::Annotation::Comment->new(-text => $FIELDS{POSITION}));
279 if ($FIELDS{CLASS}) {
280 $FIELDS{CLASS} =~ s/^CLASS\s+//;
281 $FIELDS{'CLASS'} =~ s/\n//g;
282 while($FIELDS{CLASS} =~ /(.*?)\[(\S+):(\S+)\]/g){
283 my ($pathway,$db,$id) = ($1,$2,$3);
284 $pathway =~ s/\s+/ /g;
285 $pathway =~ s/\s$//g;
286 $pathway =~ s/^\s+//;
287 $annotation->add_Annotation('pathway',
288 Bio::Annotation::Comment->new(-text => $pathway));
290 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
291 -database => $db, -primary_id => $id));
295 if($FIELDS{DBLINKS}) {
296 $FIELDS{DBLINKS} =~ s/^DBLINKS/ /;
297 while($FIELDS{DBLINKS} =~ /\s+(\S+):\s+(\S+)\n?/gs){ ### modified
298 $annotation->add_Annotation('dblink',Bio::Annotation::DBLink->new(
299 -database => $1, -primary_id => $2)) if $1;
303 $params{'-alphabet'} = 'dna';
304 $params{'-seq'} = $nt_seq;
305 $params{'-display_id'} = $name;
306 $params{'-accession_number'} = $entry_id;
307 $params{'-species'} = Bio::Species->new(
308 -common_name => $entry_species);
309 $params{'-annotation'} = $annotation;
311 $builder->add_slot_value(%params);
312 $seq = $builder->make_object();
314 return $seq;
317 =head2 write_seq
319 Title : write_seq
320 Note : write_seq() is not implemented for KEGG format output.
322 =cut
324 sub write_seq {
325 shift->throw("write_seq() not implemented for KEGG format output.");