3 # Cared for by Keith James
5 # Copyright Genome Research Ltd.
7 # You may distribute this module under the same terms as Perl itself
9 # POD documentation - main docs before the code
13 Bio::Tools::Geneid - Results of one geneid run
17 use Bio::Tools::Geneid;
18 my $gid = Bio::Tools::Geneid(-file => "geneid.out");
20 while (my $gene = $gid->next_prediction)
22 my @transcripts = $gene->transcripts;
23 foreach my $t (@transcripts)
25 my @exons = $t->exons;
26 foreach my $e (@exons)
28 printf("Exon %d..%d\n", $e->start, $e->end);
35 This is the parser for the output of geneid by Enrique Blanco and
36 Roderic Guigó (IMIM-UPF). See http://www1.imim.es/software/geneid. It
37 relies on native geneid output format internally and will work with
38 geneid versions 1.0 and 1.1. Currently this module supports only the
39 default mode of operation which is to predict exons and assemble an
40 optimal gene prediction.
42 It takes either a file handle or a file name and returns a
43 Bio::SeqFeature::Gene::GeneStructure object.
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to one
51 of the Bioperl mailing lists. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 the bugs and their resolution. Bug reports can be submitted via the
62 http://bugzilla.open-bio.org/
64 =head1 AUTHOR - Keith James
66 Email: kdj@sanger.ac.uk
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
75 package Bio
::Tools
::Geneid
;
77 use vars
qw($SOURCE_TAG);
80 use Bio::Tools::AnalysisResult;
81 use Bio::SeqFeature::Generic;
82 use Bio::SeqFeature::Gene::Exon;
83 use Bio::SeqFeature::Gene::Transcript;
84 use Bio::SeqFeature::Gene::GeneStructure;
86 use base qw(Bio::Root::Root Bio::Root::IO);
87 $SOURCE_TAG = 'geneid';
92 Usage : $obj->new(-file = "<geneid.out");
93 $obj->new(-fh => \*GI);
94 Function: Constructor for geneid wrapper. Takes either a file
96 Returns : L<Bio::Tools::Geneid>
102 my($class, @args) = @_;
103 my $self = $class->SUPER::new
(@args);
104 $self->_initialize_io(@args);
108 =head2 next_prediction
110 Title : next_prediction
111 Usage : while($gene = $geneid->next_prediction)
115 Function: Returns the gene structure prediction of the geneid result
116 file. Call this method repeatedly until FALSE is returned.
117 Returns : A Bio::SeqFeature::Gene::GeneStructure object
126 my ($gene, $transcript, $current_gene_id);
127 my $transcript_score = 0;
129 my ($gene_id, $exon_type, $exon_start, $exon_end, $exon_score,
130 $exon_strand, $start_phase, $end_phase, $start_sig_score,
131 $end_sig_score, $coding_pot_score, $homol_score);
133 while (defined($_ = $self->_readline))
140 # We have a choice of geneid, gff or XML formats. The native
141 # geneid format has more information than gff. However, we
142 # then need to perform the hack of extracting the sequence ID
143 # from the header of the embedded Fasta file which comes after
144 # the exon data, as it is not stored elsewhere. Ack.
145 # the new version of geneID includes the sequence ID in a slightly
146 # different format and a new "or" statement was added below
147 # also removed "unless defined $self->_target_id;" inorder to continue
148 # generating new sequence IDs.
150 if (/^>(\S+)\|GeneId/ or /^# Sequence (\S+)/)
153 $self->_target_id($target_id);
157 next unless (/(Single|First|Internal|Terminal)/);
159 my @fields = split(/\s+/, $_);
161 # Grab gene_id from eol first as there are issues with
162 # inconsistent whitespace in the AA coords field
163 $gene_id = pop @fields;
165 ($exon_type, $exon_start, $exon_end, $exon_score,
166 $exon_strand, $start_phase, $end_phase, $start_sig_score,
167 $end_sig_score, $coding_pot_score, $homol_score) = @fields[0..10];
169 if (! defined $current_gene_id)
171 # Starting the requested prediction
172 $current_gene_id = $gene_id;
173 $transcript_score = $exon_score;
175 $gene = Bio
::SeqFeature
::Gene
::GeneStructure
->new(-source
=>
177 $transcript = Bio
::SeqFeature
::Gene
::Transcript
->new(-source
=>
180 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
181 $exon_strand, $start_phase, $end_phase, $start_sig_score,
182 $end_sig_score, $coding_pot_score, $homol_score);
184 elsif ($gene_id eq $current_gene_id)
186 # Still in requested prediction
187 $transcript_score += $exon_score;
189 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
190 $exon_strand, $start_phase, $end_phase, $start_sig_score,
191 $end_sig_score, $coding_pot_score, $homol_score);
195 # Found following prediction
196 $self->_pushback($_);
203 $transcript->seq_id($self->_target_id);
204 $transcript->score($transcript_score);
205 $gene->add_transcript($transcript);
206 $gene->seq_id($self->_target_id);
208 foreach my $exon ($gene->exons)
210 $exon->seq_id($self->_target_id);
213 $self->_set_strand($gene);
222 Usage : $obj->_add_exon($gene, $transcript, ... exon data ...)
223 Function: Adds a new exon to both gene and transcript from the data
232 my ($self, $gene, $transcript, $exon_type, $exon_start, $exon_end,
233 $exon_score, $exon_strand, $start_phase, $end_phase, $start_sig_score,
234 $end_sig_score, $coding_pot_score, $homol_score) = @_;
236 $exon_type =~ s/First/Initial/;
238 my $strand = $exon_strand eq '+' ?
1 : -1;
240 my $exon = Bio
::SeqFeature
::Gene
::Exon
->new(-source
=> $SOURCE_TAG,
241 -start
=> $exon_start,
244 -score
=> $exon_score);
246 $exon->add_tag_value("Type", $exon_type);
247 $exon->add_tag_value('phase', $start_phase);
248 $exon->add_tag_value('end_phase', $end_phase);
249 $exon->add_tag_value('start_signal_score', $start_sig_score);
250 $exon->add_tag_value('end_signal_score', $end_sig_score);
251 $exon->add_tag_value('coding_potential_score', $coding_pot_score);
252 $exon->add_tag_value('homology_score', $homol_score);
254 $transcript->strand($strand) unless $transcript->strand != 0;
255 $transcript->add_exon($exon, $exon_type);
261 Usage : $obj->_set_strand($gene)
262 Function: Sets the overall gene strand to the same strand as all
263 : the exons if they are all on the same strand, or to strand 0
264 : if the exons are on different strands.
272 my ($self, $gene) = @_;
277 my @exons = $gene->exons;
278 foreach my $exon (@exons)
280 my $strand = $exon->strand;
286 elsif ($strand == -1)
296 elsif ($#exons == $rev)
311 Usage : $obj->_target_id
312 Function: get/set for genomic sequence id
314 Returns : A target ID
320 my ($self,$val) = @_;
323 $self->{'_target_id'} = $val;
326 return $self->{'_target_id'};