3 # BioPerl module for Bio::Tools::Genscan
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Hilmar Lapp <hlapp@gmx.net>
9 # Copyright Hilmar Lapp
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::Genscan - Results of one Genscan run
21 use Bio::Tools::Genscan;
23 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
25 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
28 # note: this class is-a Bio::Tools::AnalysisResult which implements
29 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
30 while($gene = $genscan->next_prediction()) {
31 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
32 # off Bio::SeqFeature::Gene::Transcript.
34 # $gene->exons() returns an array of
35 # Bio::Tools::Prediction::Exon objects
37 @exon_arr = $gene->exons();
40 @init_exons = $gene->exons('Initial');
42 @intrl_exons = $gene->exons('Internal');
44 @term_exons = $gene->exons('Terminal');
46 ($single_exon) = $gene->exons();
49 # essential if you gave a filename at initialization (otherwise the file
55 The Genscan module provides a parser for Genscan gene structure prediction
56 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
59 This module also implements the Bio::SeqAnalysisParserI interface, and thus
60 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to one
68 of the Bioperl mailing lists. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 Please direct usage questions or support issues to the mailing list:
77 L<bioperl-l@bioperl.org>
79 rather than to the module maintainer directly. Many experienced and
80 reponsive experts will be able look at the problem and quickly
81 address it. Please include a thorough description of the problem
82 with code and data examples if at all possible.
86 Report bugs to the Bioperl bug tracking system to help us keep track
87 the bugs and their resolution. Bug reports can be submitted via the
90 http://bugzilla.open-bio.org/
92 =head1 AUTHOR - Hilmar Lapp
98 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
103 # Let the code begin...
106 package Bio
::Tools
::Genscan
;
111 use Bio
::Tools
::Prediction
::Gene
;
112 use Bio
::Tools
::Prediction
::Exon
;
114 use base
qw(Bio::Tools::AnalysisResult);
116 my %ExonTags = ('Init' => 'Initial',
117 'Intr' => 'Internal',
118 'Term' => 'Terminal',
121 sub _initialize_state
{
122 my ($self,@args) = @_;
124 # first call the inherited method!
125 $self->SUPER::_initialize_state
(@args);
127 # our private state variables
128 $self->{'_preds_parsed'} = 0;
129 $self->{'_has_cds'} = 0;
130 # array of pre-parsed predictions
131 $self->{'_preds'} = [];
133 $self->{'_seqstack'} = [];
136 =head2 analysis_method
138 Usage : $genscan->analysis_method();
139 Purpose : Inherited method. Overridden to ensure that the name matches
147 sub analysis_method
{
149 my ($self, $method) = @_;
150 if($method && ($method !~ /genscan/i)) {
151 $self->throw("method $method not supported in " . ref($self));
153 return $self->SUPER::analysis_method
($method);
159 Usage : while($gene = $genscan->next_feature()) {
162 Function: Returns the next gene structure prediction of the Genscan result
163 file. Call this method repeatedly until FALSE is returned.
165 The returned object is actually a SeqFeatureI implementing object.
166 This method is required for classes implementing the
167 SeqAnalysisParserI interface, and is merely an alias for
168 next_prediction() at present.
171 Returns : A Bio::Tools::Prediction::Gene object.
177 my ($self,@args) = @_;
178 # even though next_prediction doesn't expect any args (and this method
179 # does neither), we pass on args in order to be prepared if this changes
181 return $self->next_prediction(@args);
184 =head2 next_prediction
186 Title : next_prediction
187 Usage : while($gene = $genscan->next_prediction()) {
190 Function: Returns the next gene structure prediction of the Genscan result
191 file. Call this method repeatedly until FALSE is returned.
194 Returns : A Bio::Tools::Prediction::Gene object.
199 sub next_prediction
{
203 # if the prediction section hasn't been parsed yet, we do this now
204 $self->_parse_predictions() unless $self->_predictions_parsed();
206 # get next gene structure
207 $gene = $self->_prediction();
210 # fill in predicted protein, and if available the predicted CDS
213 # use the seq stack if there's a seq on it
214 my $seqobj = pop(@
{$self->{'_seqstack'}});
216 # otherwise read from input stream
217 ($id, $seq) = $self->_read_fasta_seq();
218 # there may be no sequence at all, or none any more
220 $seqobj = Bio
::PrimarySeq
->new('-seq' => $seq,
221 '-display_id' => $id,
222 '-alphabet' => "protein");
226 # check that prediction number matches the prediction number
227 # indicated in the sequence id (there may be incomplete gene
228 # predictions that contain only signals with no associated protein
229 # and CDS, like promoters, poly-A sites etc)
230 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
232 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
233 # this is not our sequence, so push back for next prediction
234 push(@
{$self->{'_seqstack'}}, $seqobj);
236 $gene->predicted_protein($seqobj);
237 # CDS prediction, too?
238 if($self->_has_cds()) {
239 ($id, $seq) = $self->_read_fasta_seq();
240 $seqobj = Bio
::PrimarySeq
->new('-seq' => $seq,
241 '-display_id' => $id,
242 '-alphabet' => "dna");
243 $gene->predicted_cds($seqobj);
252 =head2 _parse_predictions
254 Title : _parse_predictions()
255 Usage : $obj->_parse_predictions()
256 Function: Parses the prediction section. Automatically called by
257 next_prediction() if not yet done.
263 sub _parse_predictions
{
268 while(defined($_ = $self->_readline())) {
269 if(/^\s*(\d+)\.(\d+)/) {
272 my $signalnr = $2; # not used presently
273 if(! defined($gene)) {
274 $gene = Bio
::Tools
::Prediction
::Gene
->new(
275 '-primary' => "GenePrediction$prednr",
276 '-source' => 'Genscan');
280 my @flds = split(' ', $_);
281 # create the feature object depending on the type of signal
283 my $is_exon = grep {$_ eq $flds[1];} (keys(%ExonTags));
285 $predobj = Bio
::Tools
::Prediction
::Exon
->new();
287 # PolyA site, or Promoter
288 $predobj = Bio
::SeqFeature
::Generic
->new();
291 $predobj->source_tag('Genscan');
292 $predobj->score($flds[$#flds]);
293 $predobj->strand((($flds[2] eq '+') ?
1 : -1));
294 my ($start, $end) = @flds[(3,4)];
295 if($predobj->strand() == 1) {
296 $predobj->start($start);
299 $predobj->end($start);
300 $predobj->start($end);
302 # add to gene structure (should be done only when start and end
303 # are set, in order to allow for proper expansion of the range)
305 # first, set fields unique to exons
306 $predobj->start_signal_score($flds[8]);
307 $predobj->end_signal_score($flds[9]);
308 $predobj->coding_signal_score($flds[10]);
309 $predobj->significance($flds[11]);
310 $predobj->primary_tag($ExonTags{$flds[1]} . 'Exon');
311 $predobj->is_coding(1);
312 # Figure out the frame of this exon. This is NOT the frame
313 # given by Genscan, which is the absolute frame of the base
314 # starting the first predicted complete codon. By comparing
315 # to the absolute frame of the first base we can compute the
316 # offset of the first complete codon to the first base of the
317 # exon, which determines the frame of the exon.
319 if($predobj->strand() == 1) {
320 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
321 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
322 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
323 $cod_offset += 3 if($cod_offset < 1);
325 # On the reverse strand the Genscan frame also refers to
326 # the first base of the first complete codon, but viewed
327 # from forward, which is the third base viewed from
329 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
330 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
331 # situation, {2,-1} and {1,-2} correspond to offsets
332 # 1 and 2, resp. Offset 3 is the same as 0.
333 $cod_offset -= 3 if($cod_offset >= 0);
334 $cod_offset = -$cod_offset;
336 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
337 # is the frame of the first base relative to the exon, or the
338 # number of bases the first codon is missing).
339 $predobj->frame(3 - $cod_offset);
340 # then add to gene structure object
341 $gene->add_exon($predobj, $ExonTags{$flds[1]});
342 } elsif($flds[1] eq 'PlyA') {
343 $predobj->primary_tag("PolyAsite");
344 $gene->poly_A_site($predobj);
345 } elsif($flds[1] eq 'Prom') {
346 $predobj->primary_tag("Promoter");
347 $gene->add_promoter($predobj);
351 if(/^\s*$/ && defined($gene)) {
352 # current gene is completed
353 $gene->seq_id($seqname);
354 $self->_add_prediction($gene);
358 if(/^(GENSCAN)\s+(\S+)/) {
359 $self->analysis_method($1);
360 $self->analysis_method_version($2);
363 if(/^Sequence\s+(\S+)\s*:/) {
368 if(/^Parameter matrix:\s+(\S+)/i) {
369 $self->analysis_subject($1);
373 if(/^Predicted coding/) {
378 # section of predicted sequences
379 $self->_pushback($_);
383 $self->_predictions_parsed(1);
388 Title : _prediction()
389 Usage : $gene = $obj->_prediction()
399 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
400 return shift(@
{$self->{'_preds'}});
403 =head2 _add_prediction
405 Title : _add_prediction()
406 Usage : $obj->_add_prediction($gene)
413 sub _add_prediction
{
414 my ($self, $gene) = @_;
416 if(! exists($self->{'_preds'})) {
417 $self->{'_preds'} = [];
419 push(@
{$self->{'_preds'}}, $gene);
422 =head2 _predictions_parsed
424 Title : _predictions_parsed
425 Usage : $obj->_predictions_parsed
428 Returns : TRUE or FALSE
432 sub _predictions_parsed
{
433 my ($self, $val) = @_;
435 $self->{'_preds_parsed'} = $val if $val;
436 if(! exists($self->{'_preds_parsed'})) {
437 $self->{'_preds_parsed'} = 0;
439 return $self->{'_preds_parsed'};
445 Usage : $obj->_has_cds()
446 Function: Whether or not the result contains the predicted CDSs, too.
448 Returns : TRUE or FALSE
453 my ($self, $val) = @_;
455 $self->{'_has_cds'} = $val if $val;
456 if(! exists($self->{'_has_cds'})) {
457 $self->{'_has_cds'} = 0;
459 return $self->{'_has_cds'};
462 =head2 _read_fasta_seq
464 Title : _read_fasta_seq()
465 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
466 Function: Simple but specialised FASTA format sequence reader. Uses
467 $self->_readline() to retrieve input, and is able to strip off
468 the traling description lines.
470 Returns : An array of two elements.
474 sub _read_fasta_seq
{
479 my $entry = $self->_readline();
482 # complete the entry if the first line came from a pushback buffer
483 while($entry !~ />$/) {
484 last unless $_ = $self->_readline();
487 # delete everything onwards from an intervening empty line (at the
488 # end there might be statistics stuff)
489 $entry =~ s/\n\n.*$//s;
491 if($entry =~ /^(\S+)\n([^>]+)/) {
495 $self->throw("Can't parse Genscan predicted sequence entry");
497 $seq =~ s/\s//g; # Remove whitespace