sync w/ main trunk
[bioperl-live.git] / Bio / Tools / Glimmer.pm
blob57575ac7a69276acef741509d0736e8c0ede155d
1 # $Id$
3 # BioPerl module for Bio::Tools::Glimmer
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Glimmer - parser for Glimmer 2.X/3.X prokaryotic and
18 GlimmerM/GlimmerHMM eukaryotic gene predictions
20 =head1 SYNOPSIS
22 use Bio::Tools::Glimmer;
24 # file
25 my $parser = Bio::Tools::Glimmer->new(-file => $file);
26 # filehandle:
27 $parser = Bio::Tools::Glimmer->new( -fh => \*INPUT );
28 # provide a sequence identifier (Glimmer 2.X)
29 my $parser = Bio::Tools::Glimmer->new(-file => $file, -seqname => seqname);
30 # force format (override automatic detection)
31 my $parser = Bio::Tools::Glimmer->new(-file => $file, -format => 'GlimmerM');
33 # parse the results
34 # note: this class is-a Bio::Tools::AnalysisResult which implements
35 # Bio::SeqAnalysisParserI, i.e., $glimmer->next_feature() is the same
37 while(my $gene = $parser->next_prediction()) {
38 # For eukaryotic input (GlimmerM/GlimmerHMM), $gene will be an instance
39 # of Bio::Tools::Prediction::Gene, which inherits off
40 # Bio::SeqFeature::Gene::Transcript, and $gene->exons() will return an
41 # array of Bio::Tools::Prediction::Exon objects.
42 # For prokaryotic input (Glimmer2.X/Glimmer3.X), $gene will be an
43 # instance of Bio::SeqFeature::Generic
45 # all exons (eukaryotic only):
46 @exon_arr = $gene->exons();
47 # initial exons only
48 @init_exons = $gene->exons('Initial');
49 # internal exons only
50 @intrl_exons = $gene->exons('Internal');
51 # terminal exons only
52 @term_exons = $gene->exons('Terminal');
55 =head1 DESCRIPTION
57 This is a module for parsing Glimmer, GlimmerM and GlimmerHMM predictions.
58 It will create gene objects from the prediction report which can
59 be attached to a sequence using Bioperl objects, or output as GFF
60 suitable for loading into Bio::DB::GFF for use with Gbrowse.
62 Glimmer is open source and available at
63 L<http://www.cbcb.umd.edu/software/glimmer/>.
65 GlimmerM is open source and available at
66 L<http://www.tigr.org/software/glimmerm/>.
68 GlimmerHMM is open source and available at
69 L<http://www.cbcb.umd.edu/software/GlimmerHMM/>.
71 Note that Glimmer 2.X will only process the first
72 sequence in a fasta file, and the prediction report does not contain any
73 sort of sequence identifier
75 Note that Glimmer 3.X produces two output files. This module only parses
76 the .predict file.
79 =head1 FEEDBACK
81 =head2 Mailing Lists
83 User feedback is an integral part of the evolution of this and other
84 Bioperl modules. Send your comments and suggestions preferably to
85 the Bioperl mailing list. Your participation is much appreciated.
87 bioperl-l@bioperl.org - General discussion
88 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
90 =head2 Support
92 Please direct usage questions or support issues to the mailing list:
94 L<bioperl-l@bioperl.org>
96 rather than to the module maintainer directly. Many experienced and
97 reponsive experts will be able look at the problem and quickly
98 address it. Please include a thorough description of the problem
99 with code and data examples if at all possible.
101 =head2 Reporting Bugs
103 Report bugs to the Bioperl bug tracking system to help us keep track
104 of the bugs and their resolution. Bug reports can be submitted via
105 email or the web:
107 http://bugzilla.open-bio.org/
109 =head1 AUTHOR - Jason Stajich
111 Email jason-at-bioperl-dot-org
113 =head1 CONTRIBUTORS
115 Torsten Seemann
117 Mark Johnson
119 =head1 APPENDIX
121 The rest of the documentation details each of the object methods.
122 Internal methods are usually preceded with a _
124 =cut
127 # Let the code begin...
130 package Bio::Tools::Glimmer;
131 use strict;
133 use Bio::Factory::FTLocationFactory;
134 use Bio::Tools::Prediction::Gene;
135 use Bio::Tools::Prediction::Exon;
137 use base qw(Bio::Tools::AnalysisResult);
139 sub _initialize_state {
140 my($self,@args) = @_;
142 # first call the inherited method!
143 my $make = $self->SUPER::_initialize_state(@args);
145 $self->{'_preds_parsed'} = 0;
146 # array of pre-parsed predictions
147 $self->{'_preds'} = [];
150 =head2 new
152 Title : new
153 Usage : my $obj = Bio::Tools::Glimmer->new();
154 Function: Builds a new Bio::Tools::Glimmer object
155 Returns : an instance of Bio::Tools::Glimmer
156 Args : format ('Glimmer', 'GlimmerM', 'GlimmerHMM'), seqname
159 =cut
161 sub new {
162 my($class,@args) = @_;
164 my $self = $class->SUPER::new(@args);
166 my ($format, $seqname, $seqlength, $detail) =
167 $self->_rearrange([qw(FORMAT SEQNAME SEQLENGTH DETAIL)], @args);
169 # override automagic format detection
170 if (defined($format) &&
171 (($format eq 'Glimmer') ||
172 ($format eq 'GlimmerM') ||
173 ($format eq 'GlimmerHMM'))
175 $self->_format($format);
178 if (defined($detail)) {
179 $self->_format('Glimmer');
180 $self->_detail_file($detail);
183 # hardwire seq_id when creating gene and exon objects (Glimmer 2.X)
184 $self->_seqname($seqname) if defined($seqname);
186 # store the length of the input sequence (Glimmer 2.X)
187 $self->_seqlength($seqlength) if defined($seqlength);
189 return $self;
192 =head2 analysis_method
194 Usage : $glimmer->analysis_method();
195 Purpose : Inherited method. Overridden to ensure that the name matches
196 /glimmer/i.
197 Returns : String
198 Argument : n/a
200 =cut
202 #-------------
203 sub analysis_method {
204 #-------------
205 my ($self, $method) = @_;
206 if($method && ($method !~ /glimmer/i)) {
207 $self->throw("method $method not supported in " . ref($self));
209 return $self->SUPER::analysis_method($method);
212 =head2 next_feature
214 Title : next_feature
215 Usage : while($gene = $glimmer->next_feature()) {
216 # do something
218 Function: Returns the next gene structure prediction of the Glimmer result
219 file. Call this method repeatedly until FALSE is returned.
221 The returned object is actually a SeqFeatureI implementing object.
222 This method is required for classes implementing the
223 SeqAnalysisParserI interface, and is merely an alias for
224 next_prediction() at present.
226 Example :
227 Returns : A Bio::Tools::Prediction::Gene object.
228 Args :
230 =cut
232 sub next_feature {
233 my ($self,@args) = @_;
234 # even though next_prediction doesn't expect any args (and this method
235 # does neither), we pass on args in order to be prepared if this changes
236 # ever
237 return $self->next_prediction(@args);
240 =head2 next_prediction
242 Title : next_prediction
243 Usage : while($gene = $glimmer->next_prediction()) {
244 # do something
246 Function: Returns the next gene structure prediction of the Glimmer result
247 file. Call this method repeatedly until FALSE is returned.
249 Example :
250 Returns : A Bio::Tools::Prediction::Gene object.
251 Args :
253 =cut
255 sub next_prediction {
256 my ($self) = @_;
257 my $gene;
259 # if the prediction section hasn't been parsed yet, we do this now
260 $self->_parse_predictions() unless $self->_predictions_parsed();
262 # get next gene structure
263 $gene = $self->_prediction();
264 return $gene;
267 =head2 _parse_predictions
269 Title : _parse_predictions()
270 Usage : $obj->_parse_predictions()
271 Function: Parses the prediction section. Automatically called by
272 next_prediction() if not yet done.
273 Example :
274 Returns :
276 =cut
278 sub _parse_predictions {
280 my ($self) = @_;
283 my %method = (
284 'Glimmer' => '_parse_prokaryotic',
285 'GlimmerM' => '_parse_eukaryotic',
286 'GlimmerHMM' => '_parse_eukaryotic',
287 '_DEFAULT_' => '_parse_eukaryotic',
290 my $format = $self->_format();
292 if (!$format) {
294 while (my $line = $self->_readline()) {
296 if ( $line =~ /^Glimmer\S*\s+\(Version\s*\S+\)/ ) {
297 $format = 'GlimmerM';
298 $self->_pushback($line);
299 last;
301 elsif ( $line =~ /^Glimmer\S*$/ ) {
302 $format = 'GlimmerHMM';
303 $self->_pushback($line);
304 last;
306 elsif ($line =~ /^Putative Genes:$/) {
307 $format = 'Glimmer';
308 $self->_pushback($line);
309 last;
311 elsif ($line =~ /^>(\S+)/) {
312 $format = 'Glimmer';
313 $self->_pushback($line);
314 last;
321 my $method =
322 (exists($method{$format})) ? $method{$format} : $method{'_DEFAULT_'};
324 return $self->$method();
329 =head2 _parse_eukaryotic
331 Title : _parse_eukaryotic()
332 Usage : $obj->_parse_eukaryotic()
333 Function: Parses the prediction section. Automatically called by
334 next_prediction() if not yet done.
335 Example :
336 Returns :
338 =cut
340 sub _parse_eukaryotic {
341 my ($self) = @_;
343 my ($gene,$seqname,$seqlen,$source,$lastgenenum);
345 while(defined($_ = $self->_readline())) {
346 if( /^(Glimmer\S*)\s+\(Version\s*(\S+)\)/ ) {
347 $source = "$1_$2";
348 next;
349 } elsif( /^(GlimmerHMM\S*)$/ ) { # GlimmerHMM has no version
350 $source = $1;
351 next;
352 } elsif(/^Sequence name:\s+(.+)$/ ) {
353 $seqname = $1;
354 next;
355 } elsif( /^Sequence length:\s+(\S+)/ ) {
356 $seqlen = $1;
357 next;
358 } elsif( m/^(Predicted genes)|(Gene)|\s+\#/ || /^\s+$/ ) {
359 next;
361 } elsif( # GlimmerM/HMM gene-exon prediction line
362 /^\s+(\d+)\s+ # gene num
363 (\d+)\s+ # exon num
364 ([\+\-])\s+ # strand
365 (\S+)\s+ # exon type
366 (\d+)\s+(\d+) # exon start, end
367 \s+(\d+) # exon length
368 /ox ) {
369 my ($genenum,$exonnum,$strand,$type,$start,$end,$len) =
370 ( $1,$2,$3,$4,$5,$6,$7);
371 if( ! $lastgenenum || $lastgenenum != $genenum) {
372 $self->_add_prediction($gene) if ( $gene );
373 $gene = Bio::Tools::Prediction::Gene->new
375 '-seq_id' => $seqname,
376 '-primary_tag' => "gene",
377 '-source_tag' => $source,
378 '-tag' => { 'Group' => "GenePrediction$genenum"},
381 my $exon = Bio::Tools::Prediction::Exon->new
382 ('-seq_id' => $seqname,
383 '-start' => $start,
384 '-end' => $end,
385 '-strand' => $strand eq '-' ? '-1' : '1',
386 '-source_tag' => $source,
387 '-primary_tag'=> 'exon',
388 '-tag' => { 'Group' => "GenePrediction$genenum"},
390 $gene->add_exon($exon,lc($type));
391 $lastgenenum = $genenum;
394 $self->_add_prediction($gene) if( $gene );
395 $self->_predictions_parsed(1);
398 =head2 _parse_prokaryotic
400 Title : _parse_prokaryotic()
401 Usage : $obj->_parse_prokaryotic()
402 Function: Parses the prediction section. Automatically called by
403 next_prediction() if not yet done.
404 Example :
405 Returns :
407 =cut
409 sub _parse_prokaryotic {
410 my ($self) = @_;
412 # default value, possibly overriden later
413 my $source = 'Glimmer';
415 # Store the sequence length(s) here, either from the
416 # seqlength arg to the constructor, or from the
417 # Glimmer 3.X detail file
418 my %seqlength = ( );
420 # Glimmer 2.X does not provide a sequence identifer
421 # in the prediction report (will default to unknown
422 # if not specified in the seqname arg to the
423 # constructor
425 # Glimmer 2.X does not report the length of the
426 # input sequence, either (will default to undef
427 # if not specified in the seqlength arg to the
428 # constructor
429 my $seqname = $self->_seqname();
430 my $seqlength = $self->_seqlength();
432 if (defined($seqlength)) {
433 $seqlength{$seqname} = $seqlength
436 # Parse the detail file, if we have one (Glimmer 3.X)
437 my $detail_file = $self->_detail_file();
439 if (defined($detail_file)) {
441 my $io = Bio::Root::IO->new(-file => $detail_file);
442 my $seqname;
444 while (defined($_ = $io->_readline())) {
445 if ($_ =~ /^>(\S+)/) {
446 $seqname = $1;
447 next;
450 if (defined($seqname) && ($_ =~ /^Sequence length = (\d+)$/)) {
451 $seqlength{$seqname} = $1;
452 next;
457 my $location_factory = Bio::Factory::FTLocationFactory->new();
459 while(defined($_ = $self->_readline())) {
460 # Glimmer 3.X does provide a sequence identifier -
461 # beware whitespace at the end (comes through from
462 # the fasta file)
463 if ($_ =~ /^Putative Genes:$/) {
464 $source = 'Glimmer_2.X';
465 next;
467 # Glimmer 3.X sequence identifier
468 elsif ($_ =~ /^>(\S+)/) {
469 $seqname = $1;
470 $seqlength = $seqlength{$seqname};
471 $source = 'Glimmer_3.X';
472 next;
474 elsif (
475 # Glimmer 2.X prediction
476 (/^\s+(\d+)\s+ # gene num
477 (\d+)\s+(\d+)\s+ # start, end
478 \[([\+\-])(\d{1})\s+ # strand, frame
479 /ox ) ||
480 # Glimmer 3.X prediction
481 (/^[^\d]+(\d+)\s+ # orf (numeric portion)
482 (\d+)\s+(\d+)\s+ # start, end
483 ([\+\-])(\d{1})\s+ # strand, frame
484 ([\d\.]+) # score
485 /ox)) {
486 my ($genenum,$start,$end,$strand,$frame,$score) =
487 ( $1,$2,$3,$4,$5,$6 );
489 my $circular_prediction = 0;
491 # Check for a circular prediction before we
492 # start fiddling with the coordinates
493 if ($strand eq '+') {
494 if ($start > $end) {
495 $circular_prediction = 1;
498 else {
499 if ($start < $end) {
500 $circular_prediction = 1;
504 if ($circular_prediction) {
505 unless (defined($seqlength)) {
506 $self->throw("need to know the sequence length to handle wraparound genes");
510 # Glimmer 2.X predictions do not include
511 # the stop codon - this might extend the
512 # prediction off either end of the sequence.
513 # This works fine even on circular/wraparound
514 # predictions.
515 if ($source eq 'Glimmer_2.X') {
516 if ($strand eq '+') {
517 $end += 3;
519 else {
520 $end -= 3;
524 # We might have extended a Glimmer 2.X prediction
525 # beyond the boundaries of the input sequence.
526 # Also, Glimmer 3.X (with -X) will output predictions
527 # with coordinates less than 1 or greater than the
528 # length of the sequence.
529 my ($fst, $fend);
530 foreach my $coord ($start, $end) {
531 if ($coord < 1) {
532 $coord = '<1';
533 $fst++;
534 } elsif (defined($seqlength) && ($coord > $seqlength)) {
535 $coord = ">$seqlength";
536 $fend++;
540 my $location_string;
542 if ($circular_prediction) {
543 if ($strand eq '+') {
544 $location_string = "join($start..$seqlength,1..$end)";
546 else {
547 $location_string = "join($start..1,$seqlength..$end)";
550 else {
551 # start must always be less than end for gene locations
552 if ($strand eq '-' && !$fst && !$fend && $start > $end) {
553 ($start, $end) = ($end, $start);
555 $location_string = "$start..$end";
558 my $location_object =
559 $location_factory->from_string($location_string);
561 # convert glimmer's frame range from 1-3 to SeqFeature's 0-2.
562 $frame--;
564 my $gene = Bio::SeqFeature::Generic->new
566 '-seq_id' => $seqname,
567 '-location' => $location_object,
568 '-strand' => $strand eq '-' ? '-1' : '1',
569 '-frame' => $frame,
570 '-source_tag' => $source,
571 '-display_name' => "orf$genenum",
572 '-primary_tag'=> 'gene',
573 '-tag' => { 'Group' => "GenePrediction_$genenum"},
574 '-score' => $score || undef
577 $self->_add_prediction($gene)
581 $self->_predictions_parsed(1);
584 =head2 _prediction
586 Title : _prediction()
587 Usage : $gene = $obj->_prediction()
588 Function: internal
589 Example :
590 Returns :
592 =cut
594 sub _prediction {
595 my ($self) = @_;
597 return unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
598 return shift(@{$self->{'_preds'}});
601 =head2 _add_prediction
603 Title : _add_prediction()
604 Usage : $obj->_add_prediction($gene)
605 Function: internal
606 Example :
607 Returns :
609 =cut
611 sub _add_prediction {
612 my ($self, $gene) = @_;
614 if(! exists($self->{'_preds'})) {
615 $self->{'_preds'} = [];
617 push(@{$self->{'_preds'}}, $gene);
620 =head2 _predictions_parsed
622 Title : _predictions_parsed
623 Usage : $obj->_predictions_parsed
624 Function: internal
625 Example :
626 Returns : TRUE or FALSE
628 =cut
630 sub _predictions_parsed {
631 my ($self, $val) = @_;
633 $self->{'_preds_parsed'} = $val if $val;
634 if(! exists($self->{'_preds_parsed'})) {
635 $self->{'_preds_parsed'} = 0;
637 return $self->{'_preds_parsed'};
640 =head2 _seqname
642 Title : _seqname
643 Usage : $obj->_seqname($seqname)
644 Function: internal (for Glimmer 2.X)
645 Example :
646 Returns : String
648 =cut
650 sub _seqname {
651 my ($self, $val) = @_;
653 $self->{'_seqname'} = $val if $val;
654 if(! exists($self->{'_seqname'})) {
655 $self->{'_seqname'} = 'unknown';
657 return $self->{'_seqname'};
660 =head2 _seqlength
662 Title : _seqlength
663 Usage : $obj->_seqlength($seqlength)
664 Function: internal (for Glimmer 2.X)
665 Example :
666 Returns : String
668 =cut
670 sub _seqlength {
671 my ($self, $val) = @_;
673 $self->{'_seqlength'} = $val if $val;
674 return $self->{'_seqlength'};
677 =head2 _format
679 Title : _format
680 Usage : $obj->_format($format)
681 Function: internal
682 Example :
683 Returns : String
685 =cut
687 sub _format {
688 my ($self, $val) = @_;
690 $self->{'_format'} = $val if $val;
692 return $self->{'_format'};
695 =head2 _detail_file
697 Title : _detail_file
698 Usage : $obj->_detail_file($filename)
699 Function: internal (for Glimmer 3.X)
700 Example :
701 Returns : String
703 =cut
705 sub _detail_file {
706 my ($self, $val) = @_;
708 $self->{'_detail_file'} = $val if $val;
709 return $self->{'_detail_file'};