3 # BioPerl module for Bio::Tools::Glimmer
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::Glimmer - parser for Glimmer 2.X/3.X prokaryotic and
18 GlimmerM/GlimmerHMM eukaryotic gene predictions
22 use Bio::Tools::Glimmer;
25 my $parser = Bio::Tools::Glimmer->new(-file => $file);
27 $parser = Bio::Tools::Glimmer->new( -fh => \*INPUT );
28 # provide a sequence identifier (Glimmer 2.X)
29 my $parser = Bio::Tools::Glimmer->new(-file => $file, -seqname => seqname);
30 # force format (override automatic detection)
31 my $parser = Bio::Tools::Glimmer->new(-file => $file, -format => 'GlimmerM');
34 # note: this class is-a Bio::Tools::AnalysisResult which implements
35 # Bio::SeqAnalysisParserI, i.e., $glimmer->next_feature() is the same
37 while(my $gene = $parser->next_prediction()) {
38 # For eukaryotic input (GlimmerM/GlimmerHMM), $gene will be an instance
39 # of Bio::Tools::Prediction::Gene, which inherits off
40 # Bio::SeqFeature::Gene::Transcript, and $gene->exons() will return an
41 # array of Bio::Tools::Prediction::Exon objects.
42 # For prokaryotic input (Glimmer2.X/Glimmer3.X), $gene will be an
43 # instance of Bio::SeqFeature::Generic
45 # all exons (eukaryotic only):
46 @exon_arr = $gene->exons();
48 @init_exons = $gene->exons('Initial');
50 @intrl_exons = $gene->exons('Internal');
52 @term_exons = $gene->exons('Terminal');
57 This is a module for parsing Glimmer, GlimmerM and GlimmerHMM predictions.
58 It will create gene objects from the prediction report which can
59 be attached to a sequence using Bioperl objects, or output as GFF
60 suitable for loading into Bio::DB::GFF for use with Gbrowse.
62 Glimmer is open source and available at
63 L<http://www.cbcb.umd.edu/software/glimmer/>.
65 GlimmerM is open source and available at
66 L<http://www.tigr.org/software/glimmerm/>.
68 GlimmerHMM is open source and available at
69 L<http://www.cbcb.umd.edu/software/GlimmerHMM/>.
71 Note that Glimmer 2.X will only process the first
72 sequence in a fasta file, and the prediction report does not contain any
73 sort of sequence identifier
75 Note that Glimmer 3.X produces two output files. This module only parses
83 User feedback is an integral part of the evolution of this and other
84 Bioperl modules. Send your comments and suggestions preferably to
85 the Bioperl mailing list. Your participation is much appreciated.
87 bioperl-l@bioperl.org - General discussion
88 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
92 Please direct usage questions or support issues to the mailing list:
94 L<bioperl-l@bioperl.org>
96 rather than to the module maintainer directly. Many experienced and
97 reponsive experts will be able look at the problem and quickly
98 address it. Please include a thorough description of the problem
99 with code and data examples if at all possible.
101 =head2 Reporting Bugs
103 Report bugs to the Bioperl bug tracking system to help us keep track
104 of the bugs and their resolution. Bug reports can be submitted via
107 http://bugzilla.open-bio.org/
109 =head1 AUTHOR - Jason Stajich
111 Email jason-at-bioperl-dot-org
121 The rest of the documentation details each of the object methods.
122 Internal methods are usually preceded with a _
127 # Let the code begin...
130 package Bio
::Tools
::Glimmer
;
133 use Bio
::Factory
::FTLocationFactory
;
134 use Bio
::Tools
::Prediction
::Gene
;
135 use Bio
::Tools
::Prediction
::Exon
;
137 use base
qw(Bio::Tools::AnalysisResult);
139 sub _initialize_state
{
140 my($self,@args) = @_;
142 # first call the inherited method!
143 my $make = $self->SUPER::_initialize_state
(@args);
145 $self->{'_preds_parsed'} = 0;
146 # array of pre-parsed predictions
147 $self->{'_preds'} = [];
153 Usage : my $obj = Bio::Tools::Glimmer->new();
154 Function: Builds a new Bio::Tools::Glimmer object
155 Returns : an instance of Bio::Tools::Glimmer
156 Args : format ('Glimmer', 'GlimmerM', 'GlimmerHMM'), seqname
162 my($class,@args) = @_;
164 my $self = $class->SUPER::new
(@args);
166 my ($format, $seqname, $seqlength, $detail) =
167 $self->_rearrange([qw(FORMAT SEQNAME SEQLENGTH DETAIL)], @args);
169 # override automagic format detection
170 if (defined($format) &&
171 (($format eq 'Glimmer') ||
172 ($format eq 'GlimmerM') ||
173 ($format eq 'GlimmerHMM'))
175 $self->_format($format);
178 if (defined($detail)) {
179 $self->_format('Glimmer');
180 $self->_detail_file($detail);
183 # hardwire seq_id when creating gene and exon objects (Glimmer 2.X)
184 $self->_seqname($seqname) if defined($seqname);
186 # store the length of the input sequence (Glimmer 2.X)
187 $self->_seqlength($seqlength) if defined($seqlength);
192 =head2 analysis_method
194 Usage : $glimmer->analysis_method();
195 Purpose : Inherited method. Overridden to ensure that the name matches
203 sub analysis_method
{
205 my ($self, $method) = @_;
206 if($method && ($method !~ /glimmer/i)) {
207 $self->throw("method $method not supported in " . ref($self));
209 return $self->SUPER::analysis_method
($method);
215 Usage : while($gene = $glimmer->next_feature()) {
218 Function: Returns the next gene structure prediction of the Glimmer result
219 file. Call this method repeatedly until FALSE is returned.
221 The returned object is actually a SeqFeatureI implementing object.
222 This method is required for classes implementing the
223 SeqAnalysisParserI interface, and is merely an alias for
224 next_prediction() at present.
227 Returns : A Bio::Tools::Prediction::Gene object.
233 my ($self,@args) = @_;
234 # even though next_prediction doesn't expect any args (and this method
235 # does neither), we pass on args in order to be prepared if this changes
237 return $self->next_prediction(@args);
240 =head2 next_prediction
242 Title : next_prediction
243 Usage : while($gene = $glimmer->next_prediction()) {
246 Function: Returns the next gene structure prediction of the Glimmer result
247 file. Call this method repeatedly until FALSE is returned.
250 Returns : A Bio::Tools::Prediction::Gene object.
255 sub next_prediction
{
259 # if the prediction section hasn't been parsed yet, we do this now
260 $self->_parse_predictions() unless $self->_predictions_parsed();
262 # get next gene structure
263 $gene = $self->_prediction();
267 =head2 _parse_predictions
269 Title : _parse_predictions()
270 Usage : $obj->_parse_predictions()
271 Function: Parses the prediction section. Automatically called by
272 next_prediction() if not yet done.
278 sub _parse_predictions
{
284 'Glimmer' => '_parse_prokaryotic',
285 'GlimmerM' => '_parse_eukaryotic',
286 'GlimmerHMM' => '_parse_eukaryotic',
287 '_DEFAULT_' => '_parse_eukaryotic',
290 my $format = $self->_format();
294 while (my $line = $self->_readline()) {
296 if ( $line =~ /^Glimmer\S*\s+\(Version\s*\S+\)/ ) {
297 $format = 'GlimmerM';
298 $self->_pushback($line);
301 elsif ( $line =~ /^Glimmer\S*$/ ) {
302 $format = 'GlimmerHMM';
303 $self->_pushback($line);
306 elsif ($line =~ /^Putative Genes:$/) {
308 $self->_pushback($line);
311 elsif ($line =~ /^>(\S+)/) {
313 $self->_pushback($line);
322 (exists($method{$format})) ?
$method{$format} : $method{'_DEFAULT_'};
324 return $self->$method();
329 =head2 _parse_eukaryotic
331 Title : _parse_eukaryotic()
332 Usage : $obj->_parse_eukaryotic()
333 Function: Parses the prediction section. Automatically called by
334 next_prediction() if not yet done.
340 sub _parse_eukaryotic
{
343 my ($gene,$seqname,$seqlen,$source,$lastgenenum);
345 while(defined($_ = $self->_readline())) {
346 if( /^(Glimmer\S*)\s+\(Version\s*(\S+)\)/ ) {
349 } elsif( /^(GlimmerHMM\S*)$/ ) { # GlimmerHMM has no version
352 } elsif(/^Sequence name:\s+(.+)$/ ) {
355 } elsif( /^Sequence length:\s+(\S+)/ ) {
358 } elsif( m/^(Predicted genes)|(Gene)|\s+\#/ || /^\s+$/ ) {
361 } elsif( # GlimmerM/HMM gene-exon prediction line
362 /^\s
+(\d
+)\s
+ # gene num
366 (\d
+)\s
+(\d
+) # exon start, end
367 \s
+(\d
+) # exon length
369 my ($genenum,$exonnum,$strand,$type,$start,$end,$len) =
370 ( $1,$2,$3,$4,$5,$6,$7);
371 if( ! $lastgenenum || $lastgenenum != $genenum) {
372 $self->_add_prediction($gene) if ( $gene );
373 $gene = Bio
::Tools
::Prediction
::Gene
->new
375 '-seq_id' => $seqname,
376 '-primary_tag' => "gene",
377 '-source_tag' => $source,
378 '-tag' => { 'Group' => "GenePrediction$genenum"},
381 my $exon = Bio
::Tools
::Prediction
::Exon
->new
382 ('-seq_id' => $seqname,
385 '-strand' => $strand eq '-' ?
'-1' : '1',
386 '-source_tag' => $source,
387 '-primary_tag'=> 'exon',
388 '-tag' => { 'Group' => "GenePrediction$genenum"},
390 $gene->add_exon($exon,lc($type));
391 $lastgenenum = $genenum;
394 $self->_add_prediction($gene) if( $gene );
395 $self->_predictions_parsed(1);
398 =head2 _parse_prokaryotic
400 Title : _parse_prokaryotic()
401 Usage : $obj->_parse_prokaryotic()
402 Function: Parses the prediction section. Automatically called by
403 next_prediction() if not yet done.
409 sub _parse_prokaryotic
{
412 # default value, possibly overriden later
413 my $source = 'Glimmer';
415 # Store the sequence length(s) here, either from the
416 # seqlength arg to the constructor, or from the
417 # Glimmer 3.X detail file
420 # Glimmer 2.X does not provide a sequence identifer
421 # in the prediction report (will default to unknown
422 # if not specified in the seqname arg to the
425 # Glimmer 2.X does not report the length of the
426 # input sequence, either (will default to undef
427 # if not specified in the seqlength arg to the
429 my $seqname = $self->_seqname();
430 my $seqlength = $self->_seqlength();
432 if (defined($seqlength)) {
433 $seqlength{$seqname} = $seqlength
436 # Parse the detail file, if we have one (Glimmer 3.X)
437 my $detail_file = $self->_detail_file();
439 if (defined($detail_file)) {
441 my $io = Bio
::Root
::IO
->new(-file
=> $detail_file);
444 while (defined($_ = $io->_readline())) {
445 if ($_ =~ /^>(\S+)/) {
450 if (defined($seqname) && ($_ =~ /^Sequence length = (\d+)$/)) {
451 $seqlength{$seqname} = $1;
457 my $location_factory = Bio
::Factory
::FTLocationFactory
->new();
459 while(defined($_ = $self->_readline())) {
460 # Glimmer 3.X does provide a sequence identifier -
461 # beware whitespace at the end (comes through from
463 if ($_ =~ /^Putative Genes:$/) {
464 $source = 'Glimmer_2.X';
467 # Glimmer 3.X sequence identifier
468 elsif ($_ =~ /^>(\S+)/) {
470 $seqlength = $seqlength{$seqname};
471 $source = 'Glimmer_3.X';
475 # Glimmer 2.X prediction
476 (/^\s
+(\d
+)\s
+ # gene num
477 (\d
+)\s
+(\d
+)\s
+ # start, end
478 \
[([\
+\
-])(\d
{1})\s
+ # strand, frame
480 # Glimmer 3.X prediction
481 (/^[^\d
]+(\d
+)\s
+ # orf (numeric portion)
482 (\d
+)\s
+(\d
+)\s
+ # start, end
483 ([\
+\
-])(\d
{1})\s
+ # strand, frame
486 my ($genenum,$start,$end,$strand,$frame,$score) =
487 ( $1,$2,$3,$4,$5,$6 );
489 my $circular_prediction = 0;
491 # Check for a circular prediction before we
492 # start fiddling with the coordinates
493 if ($strand eq '+') {
495 $circular_prediction = 1;
500 $circular_prediction = 1;
504 if ($circular_prediction) {
505 unless (defined($seqlength)) {
506 $self->throw("need to know the sequence length to handle wraparound genes");
510 # Glimmer 2.X predictions do not include
511 # the stop codon - this might extend the
512 # prediction off either end of the sequence.
513 # This works fine even on circular/wraparound
515 if ($source eq 'Glimmer_2.X') {
516 if ($strand eq '+') {
524 # We might have extended a Glimmer 2.X prediction
525 # beyond the boundaries of the input sequence.
526 # Also, Glimmer 3.X (with -X) will output predictions
527 # with coordinates less than 1 or greater than the
528 # length of the sequence.
530 foreach my $coord ($start, $end) {
534 } elsif (defined($seqlength) && ($coord > $seqlength)) {
535 $coord = ">$seqlength";
542 if ($circular_prediction) {
543 if ($strand eq '+') {
544 $location_string = "join($start..$seqlength,1..$end)";
547 $location_string = "join($start..1,$seqlength..$end)";
551 # start must always be less than end for gene locations
552 if ($strand eq '-' && !$fst && !$fend && $start > $end) {
553 ($start, $end) = ($end, $start);
555 $location_string = "$start..$end";
558 my $location_object =
559 $location_factory->from_string($location_string);
561 # convert glimmer's frame range from 1-3 to SeqFeature's 0-2.
564 my $gene = Bio
::SeqFeature
::Generic
->new
566 '-seq_id' => $seqname,
567 '-location' => $location_object,
568 '-strand' => $strand eq '-' ?
'-1' : '1',
570 '-source_tag' => $source,
571 '-display_name' => "orf$genenum",
572 '-primary_tag'=> 'gene',
573 '-tag' => { 'Group' => "GenePrediction_$genenum"},
574 '-score' => $score || undef
577 $self->_add_prediction($gene)
581 $self->_predictions_parsed(1);
586 Title : _prediction()
587 Usage : $gene = $obj->_prediction()
597 return unless(exists($self->{'_preds'}) && @
{$self->{'_preds'}});
598 return shift(@
{$self->{'_preds'}});
601 =head2 _add_prediction
603 Title : _add_prediction()
604 Usage : $obj->_add_prediction($gene)
611 sub _add_prediction
{
612 my ($self, $gene) = @_;
614 if(! exists($self->{'_preds'})) {
615 $self->{'_preds'} = [];
617 push(@
{$self->{'_preds'}}, $gene);
620 =head2 _predictions_parsed
622 Title : _predictions_parsed
623 Usage : $obj->_predictions_parsed
626 Returns : TRUE or FALSE
630 sub _predictions_parsed
{
631 my ($self, $val) = @_;
633 $self->{'_preds_parsed'} = $val if $val;
634 if(! exists($self->{'_preds_parsed'})) {
635 $self->{'_preds_parsed'} = 0;
637 return $self->{'_preds_parsed'};
643 Usage : $obj->_seqname($seqname)
644 Function: internal (for Glimmer 2.X)
651 my ($self, $val) = @_;
653 $self->{'_seqname'} = $val if $val;
654 if(! exists($self->{'_seqname'})) {
655 $self->{'_seqname'} = 'unknown';
657 return $self->{'_seqname'};
663 Usage : $obj->_seqlength($seqlength)
664 Function: internal (for Glimmer 2.X)
671 my ($self, $val) = @_;
673 $self->{'_seqlength'} = $val if $val;
674 return $self->{'_seqlength'};
680 Usage : $obj->_format($format)
688 my ($self, $val) = @_;
690 $self->{'_format'} = $val if $val;
692 return $self->{'_format'};
698 Usage : $obj->_detail_file($filename)
699 Function: internal (for Glimmer 3.X)
706 my ($self, $val) = @_;
708 $self->{'_detail_file'} = $val if $val;
709 return $self->{'_detail_file'};