sync w/ main trunk
[bioperl-live.git] / Bio / Tools / Genscan.pm
blobee84427847fe74bf865c372111cb67d18d116bf9
1 # $Id$
3 # BioPerl module for Bio::Tools::Genscan
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Hilmar Lapp <hlapp@gmx.net>
9 # Copyright Hilmar Lapp
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Genscan - Results of one Genscan run
19 =head1 SYNOPSIS
21 use Bio::Tools::Genscan;
23 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
24 # filehandle:
25 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
27 # parse the results
28 # note: this class is-a Bio::Tools::AnalysisResult which implements
29 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
30 while($gene = $genscan->next_prediction()) {
31 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
32 # off Bio::SeqFeature::Gene::Transcript.
34 # $gene->exons() returns an array of
35 # Bio::Tools::Prediction::Exon objects
36 # all exons:
37 @exon_arr = $gene->exons();
39 # initial exons only
40 @init_exons = $gene->exons('Initial');
41 # internal exons only
42 @intrl_exons = $gene->exons('Internal');
43 # terminal exons only
44 @term_exons = $gene->exons('Terminal');
45 # singleton exons:
46 ($single_exon) = $gene->exons();
49 # essential if you gave a filename at initialization (otherwise the file
50 # will stay open)
51 $genscan->close();
53 =head1 DESCRIPTION
55 The Genscan module provides a parser for Genscan gene structure prediction
56 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
57 derived object.
59 This module also implements the Bio::SeqAnalysisParserI interface, and thus
60 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
62 =head1 FEEDBACK
64 =head2 Mailing Lists
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to one
68 of the Bioperl mailing lists. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 =head2 Support
75 Please direct usage questions or support issues to the mailing list:
77 L<bioperl-l@bioperl.org>
79 rather than to the module maintainer directly. Many experienced and
80 reponsive experts will be able look at the problem and quickly
81 address it. Please include a thorough description of the problem
82 with code and data examples if at all possible.
84 =head2 Reporting Bugs
86 Report bugs to the Bioperl bug tracking system to help us keep track
87 the bugs and their resolution. Bug reports can be submitted via the
88 web:
90 http://bugzilla.open-bio.org/
92 =head1 AUTHOR - Hilmar Lapp
94 Email hlapp@gmx.net
96 =head1 APPENDIX
98 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
100 =cut
103 # Let the code begin...
106 package Bio::Tools::Genscan;
107 use strict;
108 use Symbol;
110 use Bio::Root::Root;
111 use Bio::Tools::Prediction::Gene;
112 use Bio::Tools::Prediction::Exon;
114 use base qw(Bio::Tools::AnalysisResult);
116 my %ExonTags = ('Init' => 'Initial',
117 'Intr' => 'Internal',
118 'Term' => 'Terminal',
119 'Sngl' => '');
121 sub _initialize_state {
122 my ($self,@args) = @_;
124 # first call the inherited method!
125 $self->SUPER::_initialize_state(@args);
127 # our private state variables
128 $self->{'_preds_parsed'} = 0;
129 $self->{'_has_cds'} = 0;
130 # array of pre-parsed predictions
131 $self->{'_preds'} = [];
132 # seq stack
133 $self->{'_seqstack'} = [];
136 =head2 analysis_method
138 Usage : $genscan->analysis_method();
139 Purpose : Inherited method. Overridden to ensure that the name matches
140 /genscan/i.
141 Returns : String
142 Argument : n/a
144 =cut
146 #-------------
147 sub analysis_method {
148 #-------------
149 my ($self, $method) = @_;
150 if($method && ($method !~ /genscan/i)) {
151 $self->throw("method $method not supported in " . ref($self));
153 return $self->SUPER::analysis_method($method);
156 =head2 next_feature
158 Title : next_feature
159 Usage : while($gene = $genscan->next_feature()) {
160 # do something
162 Function: Returns the next gene structure prediction of the Genscan result
163 file. Call this method repeatedly until FALSE is returned.
165 The returned object is actually a SeqFeatureI implementing object.
166 This method is required for classes implementing the
167 SeqAnalysisParserI interface, and is merely an alias for
168 next_prediction() at present.
170 Example :
171 Returns : A Bio::Tools::Prediction::Gene object.
172 Args :
174 =cut
176 sub next_feature {
177 my ($self,@args) = @_;
178 # even though next_prediction doesn't expect any args (and this method
179 # does neither), we pass on args in order to be prepared if this changes
180 # ever
181 return $self->next_prediction(@args);
184 =head2 next_prediction
186 Title : next_prediction
187 Usage : while($gene = $genscan->next_prediction()) {
188 # do something
190 Function: Returns the next gene structure prediction of the Genscan result
191 file. Call this method repeatedly until FALSE is returned.
193 Example :
194 Returns : A Bio::Tools::Prediction::Gene object.
195 Args :
197 =cut
199 sub next_prediction {
200 my ($self) = @_;
201 my $gene;
203 # if the prediction section hasn't been parsed yet, we do this now
204 $self->_parse_predictions() unless $self->_predictions_parsed();
206 # get next gene structure
207 $gene = $self->_prediction();
209 if($gene) {
210 # fill in predicted protein, and if available the predicted CDS
212 my ($id, $seq);
213 # use the seq stack if there's a seq on it
214 my $seqobj = pop(@{$self->{'_seqstack'}});
215 if(! $seqobj) {
216 # otherwise read from input stream
217 ($id, $seq) = $self->_read_fasta_seq();
218 # there may be no sequence at all, or none any more
219 if($id && $seq) {
220 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
221 '-display_id' => $id,
222 '-alphabet' => "protein");
225 if($seqobj) {
226 # check that prediction number matches the prediction number
227 # indicated in the sequence id (there may be incomplete gene
228 # predictions that contain only signals with no associated protein
229 # and CDS, like promoters, poly-A sites etc)
230 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
231 my $prednr = $1;
232 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
233 # this is not our sequence, so push back for next prediction
234 push(@{$self->{'_seqstack'}}, $seqobj);
235 } else {
236 $gene->predicted_protein($seqobj);
237 # CDS prediction, too?
238 if($self->_has_cds()) {
239 ($id, $seq) = $self->_read_fasta_seq();
240 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
241 '-display_id' => $id,
242 '-alphabet' => "dna");
243 $gene->predicted_cds($seqobj);
249 return $gene;
252 =head2 _parse_predictions
254 Title : _parse_predictions()
255 Usage : $obj->_parse_predictions()
256 Function: Parses the prediction section. Automatically called by
257 next_prediction() if not yet done.
258 Example :
259 Returns :
261 =cut
263 sub _parse_predictions {
264 my ($self) = @_;
265 my $gene;
266 my $seqname;
268 while(defined($_ = $self->_readline())) {
269 if(/^\s*(\d+)\.(\d+)/) {
270 # exon or signal
271 my $prednr = $1;
272 my $signalnr = $2; # not used presently
273 if(! defined($gene)) {
274 $gene = Bio::Tools::Prediction::Gene->new(
275 '-primary' => "GenePrediction$prednr",
276 '-source' => 'Genscan');
278 # split into fields
279 chomp();
280 my @flds = split(' ', $_);
281 # create the feature object depending on the type of signal
282 my $predobj;
283 my $is_exon = grep {$_ eq $flds[1];} (keys(%ExonTags));
284 if($is_exon) {
285 $predobj = Bio::Tools::Prediction::Exon->new();
286 } else {
287 # PolyA site, or Promoter
288 $predobj = Bio::SeqFeature::Generic->new();
290 # set common fields
291 $predobj->source_tag('Genscan');
292 $predobj->score($flds[$#flds]);
293 $predobj->strand((($flds[2] eq '+') ? 1 : -1));
294 my ($start, $end) = @flds[(3,4)];
295 if($predobj->strand() == 1) {
296 $predobj->start($start);
297 $predobj->end($end);
298 } else {
299 $predobj->end($start);
300 $predobj->start($end);
302 # add to gene structure (should be done only when start and end
303 # are set, in order to allow for proper expansion of the range)
304 if($is_exon) {
305 # first, set fields unique to exons
306 $predobj->start_signal_score($flds[8]);
307 $predobj->end_signal_score($flds[9]);
308 $predobj->coding_signal_score($flds[10]);
309 $predobj->significance($flds[11]);
310 $predobj->primary_tag($ExonTags{$flds[1]} . 'Exon');
311 $predobj->is_coding(1);
312 # Figure out the frame of this exon. This is NOT the frame
313 # given by Genscan, which is the absolute frame of the base
314 # starting the first predicted complete codon. By comparing
315 # to the absolute frame of the first base we can compute the
316 # offset of the first complete codon to the first base of the
317 # exon, which determines the frame of the exon.
318 my $cod_offset;
319 if($predobj->strand() == 1) {
320 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
321 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
322 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
323 $cod_offset += 3 if($cod_offset < 1);
324 } else {
325 # On the reverse strand the Genscan frame also refers to
326 # the first base of the first complete codon, but viewed
327 # from forward, which is the third base viewed from
328 # reverse.
329 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
330 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
331 # situation, {2,-1} and {1,-2} correspond to offsets
332 # 1 and 2, resp. Offset 3 is the same as 0.
333 $cod_offset -= 3 if($cod_offset >= 0);
334 $cod_offset = -$cod_offset;
336 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
337 # is the frame of the first base relative to the exon, or the
338 # number of bases the first codon is missing).
339 $predobj->frame(3 - $cod_offset);
340 # then add to gene structure object
341 $gene->add_exon($predobj, $ExonTags{$flds[1]});
342 } elsif($flds[1] eq 'PlyA') {
343 $predobj->primary_tag("PolyAsite");
344 $gene->poly_A_site($predobj);
345 } elsif($flds[1] eq 'Prom') {
346 $predobj->primary_tag("Promoter");
347 $gene->add_promoter($predobj);
349 next;
351 if(/^\s*$/ && defined($gene)) {
352 # current gene is completed
353 $gene->seq_id($seqname);
354 $self->_add_prediction($gene);
355 $gene = undef;
356 next;
358 if(/^(GENSCAN)\s+(\S+)/) {
359 $self->analysis_method($1);
360 $self->analysis_method_version($2);
361 next;
363 if(/^Sequence\s+(\S+)\s*:/) {
364 $seqname = $1;
365 next;
368 if(/^Parameter matrix:\s+(\S+)/i) {
369 $self->analysis_subject($1);
370 next;
373 if(/^Predicted coding/) {
374 $self->_has_cds(1);
375 next;
377 /^>/ && do {
378 # section of predicted sequences
379 $self->_pushback($_);
380 last;
383 $self->_predictions_parsed(1);
386 =head2 _prediction
388 Title : _prediction()
389 Usage : $gene = $obj->_prediction()
390 Function: internal
391 Example :
392 Returns :
394 =cut
396 sub _prediction {
397 my ($self) = @_;
399 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
400 return shift(@{$self->{'_preds'}});
403 =head2 _add_prediction
405 Title : _add_prediction()
406 Usage : $obj->_add_prediction($gene)
407 Function: internal
408 Example :
409 Returns :
411 =cut
413 sub _add_prediction {
414 my ($self, $gene) = @_;
416 if(! exists($self->{'_preds'})) {
417 $self->{'_preds'} = [];
419 push(@{$self->{'_preds'}}, $gene);
422 =head2 _predictions_parsed
424 Title : _predictions_parsed
425 Usage : $obj->_predictions_parsed
426 Function: internal
427 Example :
428 Returns : TRUE or FALSE
430 =cut
432 sub _predictions_parsed {
433 my ($self, $val) = @_;
435 $self->{'_preds_parsed'} = $val if $val;
436 if(! exists($self->{'_preds_parsed'})) {
437 $self->{'_preds_parsed'} = 0;
439 return $self->{'_preds_parsed'};
442 =head2 _has_cds
444 Title : _has_cds()
445 Usage : $obj->_has_cds()
446 Function: Whether or not the result contains the predicted CDSs, too.
447 Example :
448 Returns : TRUE or FALSE
450 =cut
452 sub _has_cds {
453 my ($self, $val) = @_;
455 $self->{'_has_cds'} = $val if $val;
456 if(! exists($self->{'_has_cds'})) {
457 $self->{'_has_cds'} = 0;
459 return $self->{'_has_cds'};
462 =head2 _read_fasta_seq
464 Title : _read_fasta_seq()
465 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
466 Function: Simple but specialised FASTA format sequence reader. Uses
467 $self->_readline() to retrieve input, and is able to strip off
468 the traling description lines.
469 Example :
470 Returns : An array of two elements.
472 =cut
474 sub _read_fasta_seq {
475 my ($self) = @_;
476 my ($id, $seq);
477 local $/ = ">";
479 my $entry = $self->_readline();
480 if($entry) {
481 $entry =~ s/^>//;
482 # complete the entry if the first line came from a pushback buffer
483 while($entry !~ />$/) {
484 last unless $_ = $self->_readline();
485 $entry .= $_;
487 # delete everything onwards from an intervening empty line (at the
488 # end there might be statistics stuff)
489 $entry =~ s/\n\n.*$//s;
490 # id and sequence
491 if($entry =~ /^(\S+)\n([^>]+)/) {
492 $id = $1;
493 $seq = $2;
494 } else {
495 $self->throw("Can't parse Genscan predicted sequence entry");
497 $seq =~ s/\s//g; # Remove whitespace
499 return ($id, $seq);