small update
[bioperl-live.git] / Bio / Tools / Geneid.pm
blobf42d8de14be70808775a7cf59fb5c636ab4c1127
1 # $Id$
3 # Cared for by Keith James
5 # Copyright Genome Research Ltd.
7 # You may distribute this module under the same terms as Perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Tools::Geneid - Results of one geneid run
15 =head1 SYNOPSIS
17 use Bio::Tools::Geneid;
18 my $gid = Bio::Tools::Geneid(-file => "geneid.out");
20 while (my $gene = $gid->next_prediction)
22 my @transcripts = $gene->transcripts;
23 foreach my $t (@transcripts)
25 my @exons = $t->exons;
26 foreach my $e (@exons)
28 printf("Exon %d..%d\n", $e->start, $e->end);
33 =head1 DESCRIPTION
35 This is the parser for the output of geneid by Enrique Blanco and
36 Roderic Guigó (IMIM-UPF). See http://www1.imim.es/software/geneid. It
37 relies on native geneid output format internally and will work with
38 geneid versions 1.0 and 1.1. Currently this module supports only the
39 default mode of operation which is to predict exons and assemble an
40 optimal gene prediction.
42 It takes either a file handle or a file name and returns a
43 Bio::SeqFeature::Gene::GeneStructure object.
45 =head1 FEEDBACK
47 =head2 Mailing Lists
49 User feedback is an integral part of the evolution of this and other
50 Bioperl modules. Send your comments and suggestions preferably to one
51 of the Bioperl mailing lists. Your participation is much appreciated.
53 bioperl-l@bioperl.org - General discussion
54 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 =head2 Reporting Bugs
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 the bugs and their resolution. Bug reports can be submitted via the
60 web:
62 http://bugzilla.open-bio.org/
64 =head1 AUTHOR - Keith James
66 Email: kdj@sanger.ac.uk
68 =head1 APPENDIX
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
73 =cut
75 package Bio::Tools::Geneid;
77 use vars qw($SOURCE_TAG);
78 use strict;
80 use Bio::Tools::AnalysisResult;
81 use Bio::SeqFeature::Generic;
82 use Bio::SeqFeature::Gene::Exon;
83 use Bio::SeqFeature::Gene::Transcript;
84 use Bio::SeqFeature::Gene::GeneStructure;
86 use base qw(Bio::Root::Root Bio::Root::IO);
87 $SOURCE_TAG = 'geneid';
89 =head2 new
91 Title : new
92 Usage : $obj->new(-file = "<geneid.out");
93 $obj->new(-fh => \*GI);
94 Function: Constructor for geneid wrapper. Takes either a file
95 : or filehandle
96 Returns : L<Bio::Tools::Geneid>
98 =cut
100 sub new
102 my($class, @args) = @_;
103 my $self = $class->SUPER::new(@args);
104 $self->_initialize_io(@args);
105 return $self;
108 =head2 next_prediction
110 Title : next_prediction
111 Usage : while($gene = $geneid->next_prediction)
113 # do something
115 Function: Returns the gene structure prediction of the geneid result
116 file. Call this method repeatedly until FALSE is returned.
117 Returns : A Bio::SeqFeature::Gene::GeneStructure object
118 Args : None
120 =cut
122 sub next_prediction
124 my ($self) = @_;
126 my ($gene, $transcript, $current_gene_id);
127 my $transcript_score = 0;
129 my ($gene_id, $exon_type, $exon_start, $exon_end, $exon_score,
130 $exon_strand, $start_phase, $end_phase, $start_sig_score,
131 $end_sig_score, $coding_pot_score, $homol_score);
133 while (defined($_ = $self->_readline))
135 $self->debug($_);
137 s/^\s+//;
138 s/\s+$//;
140 # We have a choice of geneid, gff or XML formats. The native
141 # geneid format has more information than gff. However, we
142 # then need to perform the hack of extracting the sequence ID
143 # from the header of the embedded Fasta file which comes after
144 # the exon data, as it is not stored elsewhere. Ack.
145 # the new version of geneID includes the sequence ID in a slightly
146 # different format and a new "or" statement was added below
147 # also removed "unless defined $self->_target_id;" inorder to continue
148 # generating new sequence IDs.
150 if (/^>(\S+)\|GeneId/ or /^# Sequence (\S+)/)
152 my $target_id = $1;
153 $self->_target_id($target_id);
154 next;
157 next unless (/(Single|First|Internal|Terminal)/);
159 my @fields = split(/\s+/, $_);
161 # Grab gene_id from eol first as there are issues with
162 # inconsistent whitespace in the AA coords field
163 $gene_id = pop @fields;
165 ($exon_type, $exon_start, $exon_end, $exon_score,
166 $exon_strand, $start_phase, $end_phase, $start_sig_score,
167 $end_sig_score, $coding_pot_score, $homol_score) = @fields[0..10];
169 if (! defined $current_gene_id)
171 # Starting the requested prediction
172 $current_gene_id = $gene_id;
173 $transcript_score = $exon_score;
175 $gene = Bio::SeqFeature::Gene::GeneStructure->new(-source =>
176 $SOURCE_TAG);
177 $transcript = Bio::SeqFeature::Gene::Transcript->new(-source =>
178 $SOURCE_TAG);
180 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
181 $exon_strand, $start_phase, $end_phase, $start_sig_score,
182 $end_sig_score, $coding_pot_score, $homol_score);
184 elsif ($gene_id eq $current_gene_id)
186 # Still in requested prediction
187 $transcript_score += $exon_score;
189 $self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score,
190 $exon_strand, $start_phase, $end_phase, $start_sig_score,
191 $end_sig_score, $coding_pot_score, $homol_score);
193 else
195 # Found following prediction
196 $self->_pushback($_);
197 last;
201 if (defined $gene)
203 $transcript->seq_id($self->_target_id);
204 $transcript->score($transcript_score);
205 $gene->add_transcript($transcript);
206 $gene->seq_id($self->_target_id);
208 foreach my $exon ($gene->exons)
210 $exon->seq_id($self->_target_id);
213 $self->_set_strand($gene);
216 return $gene;
219 =head2 _add_exon
221 Title : _add_exon
222 Usage : $obj->_add_exon($gene, $transcript, ... exon data ...)
223 Function: Adds a new exon to both gene and transcript from the data
224 : supplied as args
225 Example :
226 Returns : Nothing
228 =cut
230 sub _add_exon
232 my ($self, $gene, $transcript, $exon_type, $exon_start, $exon_end,
233 $exon_score, $exon_strand, $start_phase, $end_phase, $start_sig_score,
234 $end_sig_score, $coding_pot_score, $homol_score) = @_;
236 $exon_type =~ s/First/Initial/;
238 my $strand = $exon_strand eq '+' ? 1 : -1;
240 my $exon = Bio::SeqFeature::Gene::Exon->new(-source => $SOURCE_TAG,
241 -start => $exon_start,
242 -end => $exon_end,
243 -strand => $strand,
244 -score => $exon_score);
245 $exon->is_coding(1);
246 $exon->add_tag_value("Type", $exon_type);
247 $exon->add_tag_value('phase', $start_phase);
248 $exon->add_tag_value('end_phase', $end_phase);
249 $exon->add_tag_value('start_signal_score', $start_sig_score);
250 $exon->add_tag_value('end_signal_score', $end_sig_score);
251 $exon->add_tag_value('coding_potential_score', $coding_pot_score);
252 $exon->add_tag_value('homology_score', $homol_score);
254 $transcript->strand($strand) unless $transcript->strand != 0;
255 $transcript->add_exon($exon, $exon_type);
258 =head2 _set_strand
260 Title : _set_strand
261 Usage : $obj->_set_strand($gene)
262 Function: Sets the overall gene strand to the same strand as all
263 : the exons if they are all on the same strand, or to strand 0
264 : if the exons are on different strands.
265 Example :
266 Returns : Nothing
268 =cut
270 sub _set_strand
272 my ($self, $gene) = @_;
274 my $fwd = 0;
275 my $rev = 0;
277 my @exons = $gene->exons;
278 foreach my $exon (@exons)
280 my $strand = $exon->strand;
282 if ($strand == 1)
284 $fwd++;
286 elsif ($strand == -1)
288 $rev++;
292 if ($#exons == $fwd)
294 $gene->strand(1);
296 elsif ($#exons == $rev)
298 $gene->strand(-1);
300 else
302 $gene->strand(0);
305 return $gene;
308 =head2 _target_id
310 Title : _target_id
311 Usage : $obj->_target_id
312 Function: get/set for genomic sequence id
313 Example :
314 Returns : A target ID
316 =cut
318 sub _target_id
320 my ($self,$val) = @_;
321 if ($val)
323 $self->{'_target_id'} = $val;
326 return $self->{'_target_id'};