small update
[bioperl-live.git] / Bio / Tools / Genemark.pm
blobbd78e687321a39353fc96fe60431947687ae8369
1 # $Id$
3 # BioPerl module for Bio::Tools::Genemark
5 # Cared for by Mark Fiers <hlapp@gmx.net>
7 # Copyright Hilmar Lapp, Mark Fiers
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::Genemark - Results of one Genemark run
17 =head1 SYNOPSIS
19 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
20 # filehandle:
21 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
23 # parse the results
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
26 while($gene = $Genemark->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
28 # off Bio::SeqFeature::Gene::Transcript.
30 # $gene->exons() returns an array of
31 # Bio::Tools::Prediction::Exon objects
32 # all exons:
33 @exon_arr = $gene->exons();
35 # initial exons only
36 @init_exons = $gene->exons('Initial');
37 # internal exons only
38 @intrl_exons = $gene->exons('Internal');
39 # terminal exons only
40 @term_exons = $gene->exons('Terminal');
41 # singleton exons:
42 ($single_exon) = $gene->exons();
45 # essential if you gave a filename at initialization (otherwise the file
46 # will stay open)
47 $Genemark->close();
49 =head1 DESCRIPTION
51 The Genemark module provides a parser for Genemark gene structure
52 prediction output. It parses one gene prediction into a
53 Bio::SeqFeature::Gene::Transcript- derived object.
55 This module has been developed around genemark.hmm for eukaryots v2.2a
56 and will probably not work with other versions.
59 This module also implements the Bio::SeqAnalysisParserI interface, and
60 thus can be used wherever such an object fits. See
61 L<Bio::SeqAnalysisParserI>.
63 =head1 FEEDBACK
65 =head2 Mailing Lists
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to one
69 of the Bioperl mailing lists. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 =head2 Reporting Bugs
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 the bugs and their resolution. Bug reports can be submitted via the
78 web:
80 http://bugzilla.open-bio.org/
82 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
84 Email hlapp@gmx.net
85 m.w.e.j.fiers@plant.wag-ur.nl
87 =head1 APPENDIX
89 The rest of the documentation details each of the object
90 methods. Internal methods are usually preceded with a _
92 =cut
95 # Let the code begin...
98 package Bio::Tools::Genemark;
99 use strict;
100 use Symbol;
102 use Bio::Root::Root;
103 use Bio::Tools::Prediction::Gene;
104 use Bio::Tools::Prediction::Exon;
105 use Bio::Seq;
106 use Bio::Factory::FTLocationFactory;
108 use base qw(Bio::Tools::AnalysisResult);
110 =head2 new
112 Title : new
113 Usage : my $obj = Bio::Tools::Genemark->new();
114 Function: Builds a new Bio::Tools::Genemark object
115 Returns : an instance of Bio::Tools::Genemark
116 Args : seqname
119 =cut
121 sub new {
122 my($class,@args) = @_;
124 my $self = $class->SUPER::new(@args);
126 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
128 # hardwire seq_id when creating gene and exon objects
129 $self->_seqname($seqname) if defined($seqname);
131 return $self;
134 sub _initialize_state {
135 my ($self,@args) = @_;
137 # first call the inherited method!
138 $self->SUPER::_initialize_state(@args);
140 # our private state variables
141 $self->{'_preds_parsed'} = 0;
142 $self->{'_has_cds'} = 0;
143 # array of pre-parsed predictions
144 $self->{'_preds'} = [];
145 # seq stack
146 $self->{'_seqstack'} = [];
149 =head2 analysis_method
151 Usage : $Genemark->analysis_method();
152 Purpose : Inherited method. Overridden to ensure that the name matches
153 /GeneMark.hmm/i.
154 Returns : String
155 Argument : n/a
157 =cut
159 #-------------
160 sub analysis_method {
161 #-------------
162 my ($self, $method) = @_;
163 if($method && ($method !~ /Genemark\.hmm/i)) {
164 $self->throw("method $method not supported in " . ref($self));
166 return $self->SUPER::analysis_method($method);
169 =head2 next_feature
171 Title : next_feature
172 Usage : while($gene = $Genemark->next_feature()) {
173 # do something
175 Function: Returns the next gene structure prediction of the Genemark result
176 file. Call this method repeatedly until FALSE is returned.
178 The returned object is actually a SeqFeatureI implementing object.
179 This method is required for classes implementing the
180 SeqAnalysisParserI interface, and is merely an alias for
181 next_prediction() at present.
183 Example :
184 Returns : A Bio::Tools::Prediction::Gene object.
185 Args :
187 =cut
189 sub next_feature {
190 my ($self,@args) = @_;
191 # even though next_prediction doesn't expect any args (and this method
192 # does neither), we pass on args in order to be prepared if this changes
193 # ever
194 return $self->next_prediction(@args);
197 =head2 next_prediction
199 Title : next_prediction
200 Usage : while($gene = $Genemark->next_prediction()) {
201 # do something
203 Function: Returns the next gene structure prediction of the Genemark result
204 file. Call this method repeatedly until FALSE is returned.
206 Example :
207 Returns : A Bio::Tools::Prediction::Gene object.
208 Args :
210 =cut
212 sub next_prediction {
213 my ($self) = @_;
214 my $gene;
216 # if the prediction section hasn't been parsed yet, we do this now
217 $self->_parse_predictions() unless $self->_predictions_parsed();
219 # get next gene structure
220 $gene = $self->_prediction();
222 return $gene;
225 =head2 _parse_predictions
227 Title : _parse_predictions()
228 Usage : $obj->_parse_predictions()
229 Function: Parses the prediction section. Automatically called by
230 next_prediction() if not yet done.
231 Example :
232 Returns :
234 =cut
236 sub _parse_predictions {
237 my ($self) = @_;
238 my %exontags = ('Initial' => 'Initial',
239 'Internal' => 'Internal',
240 'Terminal' => 'Terminal',
241 'Single' => '',
242 '_na_' => '');
243 my $exontag;
244 my $gene;
245 my $exontype;
246 my $current_gene_no = -1;
248 # The prediction report does not contain a sequence identifier
249 # (at least the prokaryotic version doesn't)
250 my $seqname = $self->_seqname();
252 while(defined($_ = $self->_readline())) {
254 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
256 # this is an exon, Genemark doesn't predict anything else
257 # $prednr corresponds to geneno.
258 my $prednr = $1;
260 #exon no:
261 my $signalnr = 0;
262 if ($2) { my $signalnr = $2; } # used in tag: exon_no
264 # split into fields
265 chomp();
266 my @flds = split(' ', $_);
268 # create the feature (an exon) object
269 my $predobj = Bio::Tools::Prediction::Exon->new();
272 # define info depending on it being eu- or prokaryot
273 my ($start, $end, $orientation, $prediction_source);
275 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
276 $prediction_source = "Genemark.hmm.pro";
277 $orientation = ($flds[1] eq '+') ? 1 : -1;
278 ($start, $end) = @flds[(2,3)];
279 $exontag = "_na_";
281 } else {
282 $prediction_source = "Genemark.hmm.eu";
283 $orientation = ($flds[2] eq '+') ? 1 : -1;
284 ($start, $end) = @flds[(4,5)];
285 $exontag = $flds[3];
288 # instatiate a location object via
289 # Bio::Factory::FTLocationFactory (to handle
290 # inexact coordinates)
291 my $location_string = join('..', $start, $end);
292 my $location_factory = Bio::Factory::FTLocationFactory->new();
293 my $location_obj = $location_factory->from_string($location_string);
294 $predobj->location($location_obj);
296 #store the data in the exon object
297 $predobj->source_tag($prediction_source);
298 $predobj->strand($orientation);
300 $predobj->primary_tag($exontags{$exontag} . "Exon");
302 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
304 $predobj->is_coding(1);
306 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
308 # frame calculation as in the genscan module
309 # is to be implemented...
311 #If the $prednr is not equal to the current gene, we
312 #need to make a new gene and close the old one
313 if($prednr != $current_gene_no) {
314 # a new gene, store the old one if it exists
315 if (defined ($gene)) {
316 $gene->seq_id($seqname);
317 $gene = undef ;
319 #and make a new one
320 $gene = Bio::Tools::Prediction::Gene->new
322 '-primary' => "GenePrediction$prednr",
323 '-source' => $prediction_source);
324 $self->_add_prediction($gene);
325 $current_gene_no = $prednr;
326 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
329 # Add the exon to the gene
330 $gene->add_exon($predobj, ($exontag eq "_na_" ?
331 undef : $exontags{$exontag}));
335 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
336 $self->analysis_method($1);
338 my $gm_version = $2;
340 $self->analysis_method_version($gm_version);
341 next;
344 #Matrix file for eukaryot version
345 if (/^Matrices file:\s+(\S+)?/i) {
346 $self->analysis_subject($1);
347 # since the line after the matrix file is always the date
348 # (in the output file's I have seen!) extract and store this
349 # here
350 if (defined(my $_date = $self->_readline())) {
351 chomp ($_date);
352 $self->analysis_date($_date);
356 #Matrix file for prokaryot version
357 if (/^Model file name:\s+(\S+)/) {
358 $self->analysis_subject($1);
359 # since the line after the matrix file is always the date
360 # (in the output file's I have seen!) extract and store this
361 # here
362 my $_date = $self->_readline() ;
363 if (defined($_date = $self->_readline())) {
364 chomp ($_date);
365 $self->analysis_date($_date);
369 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
370 $seqname = $1;
371 # $self->analysis_subject($seqname);
372 next;
376 /^>/ && do {
377 $self->_pushback($_);
379 # section of predicted aa sequences on recognition
380 # of a fasta start, read all sequences and find the
381 # appropriate gene
382 while (1) {
383 my ($aa_id, $seq) = $self->_read_fasta_seq();
384 last unless ($aa_id);
386 #now parse through the predictions to add the pred. protein
387 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
388 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
389 my $geneno = $1;
390 if ($aa_id =~ /\|gene.$geneno\|/) {
391 #print "x SEQ : \n $seq \nXXXX\n";
392 my $seqobj = Bio::Seq->new('-seq' => $seq,
393 '-display_id' => $aa_id,
394 '-alphabet' => "protein");
395 $gene->predicted_protein($seqobj);
396 last FINDPRED;
402 last;
406 # if the analysis query object contains a ref to a Seq of PrimarySeq
407 # object, then extract the predicted sequences and add it to the gene
408 # object.
409 if (defined $self->analysis_query()) {
410 my $orig_seq = $self->analysis_query();
411 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
412 my $predseq = "";
413 foreach my $exon ($gene->exons()) {
414 #print $exon->start() . " " . $exon->end () . "\n";
415 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
418 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
419 '-display_id' => "transl");
420 $gene->predicted_cds($seqobj);
425 $self->_predictions_parsed(1);
428 =head2 _prediction
430 Title : _prediction()
431 Usage : $gene = $obj->_prediction()
432 Function: internal
433 Example :
434 Returns :
436 =cut
438 sub _prediction {
439 my ($self) = @_;
441 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
442 return shift(@{$self->{'_preds'}});
445 =head2 _add_prediction
447 Title : _add_prediction()
448 Usage : $obj->_add_prediction($gene)
449 Function: internal
450 Example :
451 Returns :
453 =cut
455 sub _add_prediction {
456 my ($self, $gene) = @_;
458 if(! exists($self->{'_preds'})) {
459 $self->{'_preds'} = [];
461 push(@{$self->{'_preds'}}, $gene);
464 =head2 _predictions_parsed
466 Title : _predictions_parsed
467 Usage : $obj->_predictions_parsed
468 Function: internal
469 Example :
470 Returns : TRUE or FALSE
472 =cut
474 sub _predictions_parsed {
475 my ($self, $val) = @_;
477 $self->{'_preds_parsed'} = $val if $val;
478 if(! exists($self->{'_preds_parsed'})) {
479 $self->{'_preds_parsed'} = 0;
481 return $self->{'_preds_parsed'};
484 =head2 _has_cds
486 Title : _has_cds()
487 Usage : $obj->_has_cds()
488 Function: Whether or not the result contains the predicted CDSs, too.
489 Example :
490 Returns : TRUE or FALSE
492 =cut
494 sub _has_cds {
495 my ($self, $val) = @_;
497 $self->{'_has_cds'} = $val if $val;
498 if(! exists($self->{'_has_cds'})) {
499 $self->{'_has_cds'} = 0;
501 return $self->{'_has_cds'};
504 =head2 _read_fasta_seq
506 Title : _read_fasta_seq()
507 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
508 Function: Simple but specialised FASTA format sequence reader. Uses
509 $self->_readline() to retrieve input, and is able to strip off
510 the traling description lines.
511 Example :
512 Returns : An array of two elements.
514 =cut
516 sub _read_fasta_seq {
517 my ($self) = @_;
518 my ($id, $seq);
519 local $/ = ">";
521 return 0 unless (my $entry = $self->_readline());
523 $entry =~ s/^>//;
524 # complete the entry if the first line came from a pushback buffer
525 while(! ($entry =~ />$/)) {
526 last unless ($_ = $self->_readline());
527 $entry .= $_;
530 # delete everything onwards from an new fasta start (>)
531 $entry =~ s/\n>.*$//s;
532 # id and sequence
534 if($entry =~ s/^(.+)\n//) {
535 $id = $1;
536 $id =~ s/ /_/g;
537 $seq = $entry;
538 $seq =~ s/\s//g;
539 #print "\n@@ $id \n@@ $seq \n##\n";
540 } else {
541 $self->throw("Can't parse Genemark predicted sequence entry");
543 $seq =~ s/\s//g; # Remove whitespace
544 return ($id, $seq);
547 =head2 _seqname
549 Title : _seqname
550 Usage : $obj->_seqname($seqname)
551 Function: internal
552 Example :
553 Returns : String
555 =cut
557 sub _seqname {
558 my ($self, $val) = @_;
560 $self->{'_seqname'} = $val if $val;
561 if(! exists($self->{'_seqname'})) {
562 $self->{'_seqname'} = 'unknown';
564 return $self->{'_seqname'};