tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Tools / Fgenesh.pm
blob927027a05096995a704187717a97590fc35d3d32
1 # $Id$
3 # BioPerl module for Bio::Tools::Fgenesh
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Christopher Dwan (chris@dwan.org)
9 # Copied, lock stock & barrel from Genscan.pm
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Fgenesh - parse results of one Fgenesh run
19 =head1 SYNOPSIS
21 use Bio::Tools::Fgenesh;
23 $fgenesh = Bio::Tools::Fgenesh->new(-file => 'result.fgenesh');
24 # filehandle:
25 $fgenesh = Bio::Tools::Fgenesh->new( -fh => \*INPUT );
27 # parse the results
28 # note: this class is-a Bio::Tools::AnalysisResult which implements
29 # Bio::SeqAnalysisParserI, i.e., $fgensh->next_feature() is the same
30 while($gene = $fgenesh->next_prediction()) {
31 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
32 # off Bio::SeqFeature::Gene::Transcript.
34 # $gene->exons() returns an array of
35 # Bio::Tools::Prediction::Exon objects
36 # all exons:
37 @exon_arr = $gene->exons();
39 # initial exons only
40 @init_exons = $gene->exons('Initial');
41 # internal exons only
42 @intrl_exons = $gene->exons('Internal');
43 # terminal exons only
44 @term_exons = $gene->exons('Terminal');
45 # singleton exons:
46 ($single_exon) = $gene->exons();
49 # essential if you gave a filename at initialization (otherwise the file
50 # will stay open)
51 $fgenesh->close();
53 =head1 DESCRIPTION
55 The Fgenesh module provides a parser for Fgenesh (version 2) gene structure
56 prediction output. It parses one gene prediction into a
57 Bio::SeqFeature::Gene::Transcript- derived object.
59 This module also implements the L<Bio::SeqAnalysisParserI> interface, and thus
60 can be used wherever such an object fits.
62 =head1 FEEDBACK
64 =head2 Mailing Lists
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to one
68 of the Bioperl mailing lists. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 =head2 Support
75 Please direct usage questions or support issues to the mailing list:
77 I<bioperl-l@bioperl.org>
79 rather than to the module maintainer directly. Many experienced and
80 reponsive experts will be able look at the problem and quickly
81 address it. Please include a thorough description of the problem
82 with code and data examples if at all possible.
84 =head2 Reporting Bugs
86 Report bugs to the Bioperl bug tracking system to help us keep track
87 the bugs and their resolution. Bug reports can be submitted via the
88 web:
90 http://bugzilla.open-bio.org/
92 =head1 AUTHOR - Chris Dwan
94 Email chris-at-dwan.org
96 =head1 APPENDIX
98 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
100 =cut
103 # Let the code begin...
106 package Bio::Tools::Fgenesh;
107 use strict;
108 use Symbol;
110 use Bio::Root::Root;
111 use Bio::Tools::Prediction::Gene;
112 use Bio::Tools::Prediction::Exon;
114 use base qw(Bio::Tools::AnalysisResult);
116 my %ExonTags = ('CDSf' => 'Initial',
117 'CDSi' => 'Internal',
118 'CDSl' => 'Terminal',
119 'CDSo' => 'Singleton');
121 sub _initialize_state {
122 my ($self,@args) = @_;
124 # first call the inherited method!
125 $self->SUPER::_initialize_state(@args);
127 # our private state variables
128 $self->{'_preds_parsed'} = 0;
129 $self->{'_has_cds'} = 0;
130 # array of pre-parsed predictions
131 $self->{'_preds'} = [];
132 # seq stack
133 $self->{'_seqstack'} = [];
136 =head2 analysis_method
138 Usage : $genscan->analysis_method();
139 Purpose : Inherited method. Overridden to ensure that the name matches
140 /genscan/i.
141 Returns : String
142 Argument : n/a
144 =cut
146 #-------------
147 sub analysis_method {
148 #-------------
149 my ($self, $method) = @_;
150 if($method && ($method !~ /fgenesh/i)) {
151 $self->throw("method $method not supported in " . ref($self));
153 return $self->SUPER::analysis_method($method);
156 =head2 next_feature
158 Title : next_feature
159 Usage : while($gene = $fgenesh->next_feature()) {
160 # do something
162 Function: Returns the next gene structure prediction of the Fgenesh result
163 file. Call this method repeatedly until FALSE is returned.
165 The returned object is actually a SeqFeatureI implementing object.
166 This method is required for classes implementing the
167 SeqAnalysisParserI interface, and is merely an alias for
168 next_prediction() at present.
170 Example :
171 Returns : A Bio::Tools::Prediction::Gene object.
172 Args :
174 =cut
176 sub next_feature {
177 my ($self,@args) = @_;
178 # even though next_prediction doesn't expect any args (and this method
179 # does neither), we pass on args in order to be prepared if this changes
180 # ever
181 return $self->next_prediction(@args);
184 =head2 next_prediction
186 Title : next_prediction
187 Usage : while($gene = $fgenesh->next_prediction()) { ... }
188 Function: Returns the next gene structure prediction of the Genscan result
189 file. Call this method repeatedly until FALSE is returned.
190 Example :
191 Returns : A Bio::Tools::Prediction::Gene object.
192 Args :
194 =cut
196 sub next_prediction {
197 my ($self) = @_;
198 my $gene;
200 # if the prediction section hasn't been parsed yet, we do this now
201 $self->_parse_predictions() unless $self->_predictions_parsed();
203 # get next gene structure
204 $gene = $self->_prediction();
206 if($gene) {
207 # fill in predicted protein, and if available the predicted CDS
210 # use the seq stack if there's a seq on it
211 my $seqobj = pop(@{$self->{'_seqstack'}});
212 my ($id, $seq);
213 unless ($seqobj) {
214 ($id, $seq) = $self->_read_fasta_seq();
215 my $alphabet;
216 if (($id =~ /mrna/) || ($id =~ /cds/)) { $alphabet = 'dna'; }
217 else { $alphabet = 'protein'; }
218 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
219 '-display_id' => $id,
220 '-alphabet' => $alphabet);
222 if ($seqobj) {
224 # check that prediction number matches the prediction number
225 # indicated in the sequence id (there may be incomplete gene
226 # predictions that contain only signals with no associated protein
227 # prediction.
229 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
230 my $prednr = $1;
231 if ($id !~ /_predicted_(\w+)_$prednr/) {
232 # this is not our sequence, so push back for next prediction
233 push(@{$self->{'_seqstack'}}, $seqobj);
234 } else {
235 if ($1 eq "protein") {
236 $gene->predicted_protein($seqobj);
237 } elsif (($1 eq "mrna") || ($1 eq "cds")) {
238 $self->_has_cds(1);
239 $gene->predicted_cds($seqobj);
241 # Have to go back in and get the protein...
242 ($id, $seq) = $self->_read_fasta_seq();
243 if ($id =~ /_cds_/) {
244 ($id, $seq) = $self->_read_fasta_seq();
247 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
248 '-display_id' => $id,
249 '-alphabet' => "protein");
250 $gene->predicted_protein($seqobj);
256 return $gene;
259 =head2 _parse_predictions
261 Title : _parse_predictions()
262 Usage : $obj->_parse_predictions()
263 Function: Parses the prediction section. Automatically called by
264 next_prediction() if not yet done.
265 Example :
266 Returns :
268 =cut
270 sub _parse_predictions {
271 my ($self) = @_;
272 my $gene;
273 my $seqname;
275 while(defined($_ = $self->_readline())) {
277 if(/^\s*(\d+)\s+([+\-])/) {
278 my $line = $_;
280 # exon or signal
281 my $prednr = $1;
282 my $strand = ($2 eq '+') ? 1 : -1;
284 if(! defined($gene)) {
285 $gene = Bio::Tools::Prediction::Gene->new(
286 '-primary' => "GenePrediction$prednr",
287 '-source' => 'Fgenesh');
289 # split into fields
290 chomp();
291 my @flds = split(/\s+/, ' ' . $line);
292 ## NB - the above adds leading whitespace before the gene
293 ## number in case there was none (as quick patch to code
294 ## below which expects it but it is not present after 999
295 ## predictions!) This allows >999 predictions to be parsed.
297 # create the feature object depending on the type of signal
298 my $predobj;
299 my $is_exon = grep {$line =~ $_} keys(%ExonTags);
300 my ($start, $end);
301 if($is_exon) {
302 $predobj = Bio::Tools::Prediction::Exon->new();
303 $predobj->score($flds[8]);
304 $start = $flds[5];
305 $end = $flds[7];
306 } else {
307 # PolyA site, or TSS
308 $predobj = Bio::SeqFeature::Generic->new();
309 $predobj->score($flds[5]);
310 $start = $flds[4];
311 $end = $flds[4];
313 # set common fields
314 $predobj->source_tag('Fgenesh');
315 $predobj->strand($strand);
317 # Following tactical commenting-out made by
318 # malcolm.cook@stowers-institute.org since coordinate reversal is
319 # apparently vestigial copy/paste detritus from Genscan.pm origins of
320 # this module and this is NOT needed for fgenesh (at least in v
321 # 2.1.4).
323 # if($predobj->strand() == 1) {
324 $predobj->start($start);
325 $predobj->end($end);
326 # } else {
327 # $predobj->end($start);
328 # $predobj->start($end);
331 # print STDERR "start $start end $end\n";
332 # add to gene structure (should be done only when start and end
333 # are set, in order to allow for proper expansion of the range)
334 if($is_exon) {
335 # first, set fields unique to exons
336 $predobj->primary_tag($ExonTags{$flds[4]} . 'Exon');
337 $predobj->is_coding(1);
338 my $cod_offset;
339 if($predobj->strand() == 1) {
340 $cod_offset = ($flds[9] - $predobj->start()) % 3;
341 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
342 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
343 $cod_offset += 3 if($cod_offset < 1);
344 } else {
345 # On the reverse strand the Genscan frame also refers to
346 # the first base of the first complete codon, but viewed
347 # from forward, which is the third base viewed from
348 # reverse.
349 $cod_offset = ($flds[11] - $predobj->end()) % 3;
350 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
351 # situation, {2,-1} and {1,-2} correspond to offsets
352 # 1 and 2, resp. Offset 3 is the same as 0.
353 $cod_offset -= 3 if($cod_offset >= 0);
354 $cod_offset = -$cod_offset;
356 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
357 # is the frame of the first base relative to the exon, or the
358 # number of bases the first codon is missing).
359 $predobj->frame(3 - $cod_offset);
360 # print STDERR " frame is " . $predobj->frame() . "\n";
361 # then add to gene structure object
362 $gene->add_exon($predobj, $ExonTags{$flds[1]});
363 } elsif($flds[3] eq 'PolA') {
364 $predobj->primary_tag("PolyAsite");
365 $gene->poly_A_site($predobj);
366 } elsif($flds[3] eq 'TSS') {
367 $predobj->primary_tag("Promoter"); # (hey! a TSS is NOT a promoter... what's going on here?...
368 $gene->add_promoter($predobj);
369 #I'd like to do this (for now):
370 #$predobj->primary_tag("TSS"); #this is not the right model, but, it IS a feature at least.
371 #but the followg errs out
372 #$gene->add_SeqFeature($predobj); #err: MSG: start is undefined when bounds at Bio::SeqFeature::Generic::add_SeqFeature 671 check since gene has no start yet
374 else {
375 $self->throw("unrecognized prediction line: " . $line);
377 next;
380 if(/^\s*$/ && defined($gene)) {
381 # current gene is completed
382 $gene->seq_id($seqname);
383 $self->_add_prediction($gene);
384 $gene = undef;
385 next;
388 if(/^(FGENESH)\s+([\d\.]+)/) {
389 $self->analysis_method($1);
390 $self->analysis_method_version($2);
391 if (/\s(\S+)\sgenomic DNA/) {
392 $self->analysis_subject($1);
394 next;
397 if(/^\s*Seq name:\s+(\S+)/) {
398 $seqname = $1;
399 next;
402 /^Predicted protein/ && do {
403 # section of predicted sequences
404 $self->_pushback($_);
405 last;
409 $self->_predictions_parsed(1);
412 =head2 _prediction
414 Title : _prediction()
415 Usage : $gene = $obj->_prediction()
416 Function: internal
417 Example :
418 Returns :
420 =cut
422 sub _prediction {
423 my ($self) = @_;
425 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
426 return shift(@{$self->{'_preds'}});
429 =head2 _add_prediction
431 Title : _add_prediction()
432 Usage : $obj->_add_prediction($gene)
433 Function: internal
434 Example :
435 Returns :
437 =cut
439 sub _add_prediction {
440 my ($self, $gene) = @_;
442 if(! exists($self->{'_preds'})) {
443 $self->{'_preds'} = [];
445 push(@{$self->{'_preds'}}, $gene);
448 =head2 _predictions_parsed
450 Title : _predictions_parsed
451 Usage : $obj->_predictions_parsed
452 Function: internal
453 Example :
454 Returns : TRUE or FALSE
456 =cut
458 sub _predictions_parsed {
459 my ($self, $val) = @_;
461 $self->{'_preds_parsed'} = $val if $val;
462 if(! exists($self->{'_preds_parsed'})) {
463 $self->{'_preds_parsed'} = 0;
465 return $self->{'_preds_parsed'};
468 =head2 _has_cds
470 Title : _has_cds()
471 Usage : $obj->_has_cds()
472 Function: Whether or not the result contains the predicted CDSs, too.
473 Example :
474 Returns : TRUE or FALSE
476 =cut
478 sub _has_cds {
479 my ($self, $val) = @_;
481 $self->{'_has_cds'} = $val if $val;
482 if(! exists($self->{'_has_cds'})) {
483 $self->{'_has_cds'} = 0;
485 return $self->{'_has_cds'};
488 =head2 _read_fasta_seq
490 Title : _read_fasta_seq()
491 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
492 Function: Simple but specialised FASTA format sequence reader. Uses
493 $self->_readline() to retrieve input, and is able to strip off
494 the traling description lines.
495 Example :
496 Returns : An array of two elements: fasta_id & sequence
498 =cut
500 sub _read_fasta_seq {
501 my ($self) = @_;
502 my ($id, $seq);
503 #local $/ = ">";
505 my $entry = $self->_readline();
506 # print " ^^ $entry\n";
507 return unless ($entry);
508 $entry = $self->_readline() if ($entry =~ /^Predicted protein/);
509 # print " ^^ $entry\n";
511 # Pick up the header / id.
512 if ($entry =~ /^>FGENESH:/) {
513 if ($entry =~ /^>FGENESH:\s+(\d+)/) {
514 # print STDERR " this is a predicted gene\n";
515 $id = "_predicted_protein_" . $1;
516 } elsif ($entry =~ /^>FGENESH:\[mRNA\]\s+(\d+)/) {
517 # print STDERR " this is an mRNA\n";
518 $id = "_predicted_mrna_" . $1;
519 } elsif ($entry =~ /^>FGENESH:\[exon\]\s+Gene:\s+(\d+)/) {
520 $id = "_predicted_cds_" . $1;
522 $seq = "";
523 $entry = $self->_readline();
526 my $done = 0;
527 while (!$done) {
528 # print "*** $entry\n";
529 if (($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)) {
530 # print STDERR " -- informed about an exon header...\n";
531 $entry = $self->_readline();
532 } else {
533 $seq .= $entry;
534 # print STDERR " Added $entry\n";
537 last unless $entry = $self->_readline();
538 if (($entry =~ /^>/) &&
539 (!(($entry =~ /^>FGENESH:\[exon\]/) && ($id =~ /^_predicted_cds_/)))) {
540 $self->_pushback($entry); last;
544 # id and sequence
545 $seq =~ s/\s//g; # Remove whitespace
546 return ($id, $seq);