3 # BioPerl module for Bio::Tools::Genemark
5 # Cared for by Mark Fiers <hlapp@gmx.net>
7 # Copyright Hilmar Lapp, Mark Fiers
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::Genemark - Results of one Genemark run
19 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
21 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
26 while($gene = $Genemark->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
28 # off Bio::SeqFeature::Gene::Transcript.
30 # $gene->exons() returns an array of
31 # Bio::Tools::Prediction::Exon objects
33 @exon_arr = $gene->exons();
36 @init_exons = $gene->exons('Initial');
38 @intrl_exons = $gene->exons('Internal');
40 @term_exons = $gene->exons('Terminal');
42 ($single_exon) = $gene->exons();
45 # essential if you gave a filename at initialization (otherwise the file
51 The Genemark module provides a parser for Genemark gene structure
52 prediction output. It parses one gene prediction into a
53 Bio::SeqFeature::Gene::Transcript- derived object.
55 This module has been developed around genemark.hmm for eukaryots v2.2a
56 and will probably not work with other versions.
59 This module also implements the Bio::SeqAnalysisParserI interface, and
60 thus can be used wherever such an object fits. See
61 L<Bio::SeqAnalysisParserI>.
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to one
69 of the Bioperl mailing lists. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 the bugs and their resolution. Bug reports can be submitted via the
80 http://bugzilla.open-bio.org/
82 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
85 m.w.e.j.fiers@plant.wag-ur.nl
89 The rest of the documentation details each of the object
90 methods. Internal methods are usually preceded with a _
95 # Let the code begin...
98 package Bio
::Tools
::Genemark
;
103 use Bio
::Tools
::Prediction
::Gene
;
104 use Bio
::Tools
::Prediction
::Exon
;
106 use Bio
::Factory
::FTLocationFactory
;
108 use base
qw(Bio::Tools::AnalysisResult);
113 Usage : my $obj = Bio::Tools::Genemark->new();
114 Function: Builds a new Bio::Tools::Genemark object
115 Returns : an instance of Bio::Tools::Genemark
122 my($class,@args) = @_;
124 my $self = $class->SUPER::new
(@args);
126 my ($seqname) = $self->_rearrange([qw(SEQNAME)], @args);
128 # hardwire seq_id when creating gene and exon objects
129 $self->_seqname($seqname) if defined($seqname);
134 sub _initialize_state
{
135 my ($self,@args) = @_;
137 # first call the inherited method!
138 $self->SUPER::_initialize_state
(@args);
140 # our private state variables
141 $self->{'_preds_parsed'} = 0;
142 $self->{'_has_cds'} = 0;
143 # array of pre-parsed predictions
144 $self->{'_preds'} = [];
146 $self->{'_seqstack'} = [];
149 =head2 analysis_method
151 Usage : $Genemark->analysis_method();
152 Purpose : Inherited method. Overridden to ensure that the name matches
160 sub analysis_method
{
162 my ($self, $method) = @_;
163 if($method && ($method !~ /Genemark\.hmm/i)) {
164 $self->throw("method $method not supported in " . ref($self));
166 return $self->SUPER::analysis_method
($method);
172 Usage : while($gene = $Genemark->next_feature()) {
175 Function: Returns the next gene structure prediction of the Genemark result
176 file. Call this method repeatedly until FALSE is returned.
178 The returned object is actually a SeqFeatureI implementing object.
179 This method is required for classes implementing the
180 SeqAnalysisParserI interface, and is merely an alias for
181 next_prediction() at present.
184 Returns : A Bio::Tools::Prediction::Gene object.
190 my ($self,@args) = @_;
191 # even though next_prediction doesn't expect any args (and this method
192 # does neither), we pass on args in order to be prepared if this changes
194 return $self->next_prediction(@args);
197 =head2 next_prediction
199 Title : next_prediction
200 Usage : while($gene = $Genemark->next_prediction()) {
203 Function: Returns the next gene structure prediction of the Genemark result
204 file. Call this method repeatedly until FALSE is returned.
207 Returns : A Bio::Tools::Prediction::Gene object.
212 sub next_prediction
{
216 # if the prediction section hasn't been parsed yet, we do this now
217 $self->_parse_predictions() unless $self->_predictions_parsed();
219 # get next gene structure
220 $gene = $self->_prediction();
225 =head2 _parse_predictions
227 Title : _parse_predictions()
228 Usage : $obj->_parse_predictions()
229 Function: Parses the prediction section. Automatically called by
230 next_prediction() if not yet done.
236 sub _parse_predictions
{
238 my %exontags = ('Initial' => 'Initial',
239 'Internal' => 'Internal',
240 'Terminal' => 'Terminal',
246 my $current_gene_no = -1;
248 # The prediction report does not contain a sequence identifier
249 # (at least the prokaryotic version doesn't)
250 my $seqname = $self->_seqname();
252 while(defined($_ = $self->_readline())) {
254 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
256 # this is an exon, Genemark doesn't predict anything else
257 # $prednr corresponds to geneno.
262 if ($2) { my $signalnr = $2; } # used in tag: exon_no
266 my @flds = split(' ', $_);
268 # create the feature (an exon) object
269 my $predobj = Bio
::Tools
::Prediction
::Exon
->new();
272 # define info depending on it being eu- or prokaryot
273 my ($start, $end, $orientation, $prediction_source);
275 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
276 $prediction_source = "Genemark.hmm.pro";
277 $orientation = ($flds[1] eq '+') ?
1 : -1;
278 ($start, $end) = @flds[(2,3)];
282 $prediction_source = "Genemark.hmm.eu";
283 $orientation = ($flds[2] eq '+') ?
1 : -1;
284 ($start, $end) = @flds[(4,5)];
288 # instatiate a location object via
289 # Bio::Factory::FTLocationFactory (to handle
290 # inexact coordinates)
291 my $location_string = join('..', $start, $end);
292 my $location_factory = Bio
::Factory
::FTLocationFactory
->new();
293 my $location_obj = $location_factory->from_string($location_string);
294 $predobj->location($location_obj);
296 #store the data in the exon object
297 $predobj->source_tag($prediction_source);
298 $predobj->strand($orientation);
300 $predobj->primary_tag($exontags{$exontag} . "Exon");
302 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
304 $predobj->is_coding(1);
306 $predobj->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
308 # frame calculation as in the genscan module
309 # is to be implemented...
311 #If the $prednr is not equal to the current gene, we
312 #need to make a new gene and close the old one
313 if($prednr != $current_gene_no) {
314 # a new gene, store the old one if it exists
315 if (defined ($gene)) {
316 $gene->seq_id($seqname);
320 $gene = Bio
::Tools
::Prediction
::Gene
->new
322 '-primary' => "GenePrediction$prednr",
323 '-source' => $prediction_source);
324 $self->_add_prediction($gene);
325 $current_gene_no = $prednr;
326 $gene->seq_id($seqname) if (defined($seqname) && ($seqname ne 'unknown'));
329 # Add the exon to the gene
330 $gene->add_exon($predobj, ($exontag eq "_na_" ?
331 undef : $exontags{$exontag}));
335 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
336 $self->analysis_method($1);
340 $self->analysis_method_version($gm_version);
344 #Matrix file for eukaryot version
345 if (/^Matrices file:\s+(\S+)?/i) {
346 $self->analysis_subject($1);
347 # since the line after the matrix file is always the date
348 # (in the output file's I have seen!) extract and store this
350 if (defined(my $_date = $self->_readline())) {
352 $self->analysis_date($_date);
356 #Matrix file for prokaryot version
357 if (/^Model file name:\s+(\S+)/) {
358 $self->analysis_subject($1);
359 # since the line after the matrix file is always the date
360 # (in the output file's I have seen!) extract and store this
362 my $_date = $self->_readline() ;
363 if (defined($_date = $self->_readline())) {
365 $self->analysis_date($_date);
369 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
371 # $self->analysis_subject($seqname);
377 $self->_pushback($_);
379 # section of predicted aa sequences on recognition
380 # of a fasta start, read all sequences and find the
383 my ($aa_id, $seq) = $self->_read_fasta_seq();
384 last unless ($aa_id);
386 #now parse through the predictions to add the pred. protein
387 FINDPRED
: foreach my $gene (@
{$self->{'_preds'}}) {
388 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
390 if ($aa_id =~ /\|gene.$geneno\|/) {
391 #print "x SEQ : \n $seq \nXXXX\n";
392 my $seqobj = Bio
::Seq
->new('-seq' => $seq,
393 '-display_id' => $aa_id,
394 '-alphabet' => "protein");
395 $gene->predicted_protein($seqobj);
406 # if the analysis query object contains a ref to a Seq of PrimarySeq
407 # object, then extract the predicted sequences and add it to the gene
409 if (defined $self->analysis_query()) {
410 my $orig_seq = $self->analysis_query();
411 FINDPREDSEQ
: foreach my $gene (@
{$self->{'_preds'}}) {
413 foreach my $exon ($gene->exons()) {
414 #print $exon->start() . " " . $exon->end () . "\n";
415 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
418 my $seqobj = Bio
::PrimarySeq
->new('-seq' => $predseq,
419 '-display_id' => "transl");
420 $gene->predicted_cds($seqobj);
425 $self->_predictions_parsed(1);
430 Title : _prediction()
431 Usage : $gene = $obj->_prediction()
441 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
442 return shift(@
{$self->{'_preds'}});
445 =head2 _add_prediction
447 Title : _add_prediction()
448 Usage : $obj->_add_prediction($gene)
455 sub _add_prediction
{
456 my ($self, $gene) = @_;
458 if(! exists($self->{'_preds'})) {
459 $self->{'_preds'} = [];
461 push(@
{$self->{'_preds'}}, $gene);
464 =head2 _predictions_parsed
466 Title : _predictions_parsed
467 Usage : $obj->_predictions_parsed
470 Returns : TRUE or FALSE
474 sub _predictions_parsed
{
475 my ($self, $val) = @_;
477 $self->{'_preds_parsed'} = $val if $val;
478 if(! exists($self->{'_preds_parsed'})) {
479 $self->{'_preds_parsed'} = 0;
481 return $self->{'_preds_parsed'};
487 Usage : $obj->_has_cds()
488 Function: Whether or not the result contains the predicted CDSs, too.
490 Returns : TRUE or FALSE
495 my ($self, $val) = @_;
497 $self->{'_has_cds'} = $val if $val;
498 if(! exists($self->{'_has_cds'})) {
499 $self->{'_has_cds'} = 0;
501 return $self->{'_has_cds'};
504 =head2 _read_fasta_seq
506 Title : _read_fasta_seq()
507 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
508 Function: Simple but specialised FASTA format sequence reader. Uses
509 $self->_readline() to retrieve input, and is able to strip off
510 the traling description lines.
512 Returns : An array of two elements.
516 sub _read_fasta_seq
{
521 return 0 unless (my $entry = $self->_readline());
524 # complete the entry if the first line came from a pushback buffer
525 while(! ($entry =~ />$/)) {
526 last unless ($_ = $self->_readline());
530 # delete everything onwards from an new fasta start (>)
531 $entry =~ s/\n>.*$//s;
534 if($entry =~ s/^(.+)\n//) {
539 #print "\n@@ $id \n@@ $seq \n##\n";
541 $self->throw("Can't parse Genemark predicted sequence entry");
543 $seq =~ s/\s//g; # Remove whitespace
550 Usage : $obj->_seqname($seqname)
558 my ($self, $val) = @_;
560 $self->{'_seqname'} = $val if $val;
561 if(! exists($self->{'_seqname'})) {
562 $self->{'_seqname'} = 'unknown';
564 return $self->{'_seqname'};