[bug 2714]
[bioperl-live.git] / Bio / Tools / ESTScan.pm
blob76350f17a0de094fac98ac66a4068a2a46e21169
1 # $Id$
3 # BioPerl module for Bio::Tools::ESTScan
5 # Cared for by Hilmar Lapp <hlapp@gmx.net>
7 # Copyright Hilmar Lapp
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::ESTScan - Results of one ESTScan run
17 =head1 SYNOPSIS
19 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
20 # filehandle:
21 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
23 # parse the results
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
26 while($gene = $estscan->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene
28 foreach my $orf ($gene->exons()) {
29 # $orf is an instance of Bio::Tools::Prediction::Exon
30 $cds_str = $orf->predicted_cds();
34 # essential if you gave a filename at initialization (otherwise the file
35 # will stay open)
36 $estscan->close();
38 =head1 DESCRIPTION
40 The ESTScan module provides a parser for ESTScan coding region prediction
41 output.
43 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
44 implements the L<Bio::SeqAnalysisParserI> interface.
45 See L<Bio::SeqAnalysisParserI>.
47 =head1 FEEDBACK
49 =head2 Mailing Lists
51 User feedback is an integral part of the evolution of this and other
52 Bioperl modules. Send your comments and suggestions preferably to one
53 of the Bioperl mailing lists. Your participation is much appreciated.
55 bioperl-l@bioperl.org - General discussion
56 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58 =head2 Reporting Bugs
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 the bugs and their resolution. Bug reports can be submitted via the
62 web:
64 http://bugzilla.open-bio.org/
66 =head1 AUTHOR - Hilmar Lapp
68 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
70 =head1 APPENDIX
72 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
74 =cut
76 # Let the code begin...
78 package Bio::Tools::ESTScan;
79 use strict;
80 use Symbol;
82 use Bio::Root::Root;
83 use Bio::Tools::Prediction::Exon;
85 use base qw(Bio::Tools::AnalysisResult);
87 sub _initialize_state {
88 my ($self,@args) = @_;
90 # first call the inherited method!
91 my $make = $self->SUPER::_initialize_state(@args);
93 if(! $self->analysis_method()) {
94 $self->analysis_method('ESTScan');
98 =head2 analysis_method
100 Usage : $estscan->analysis_method();
101 Purpose : Inherited method. Overridden to ensure that the name matches
102 /estscan/i.
103 Returns : String
104 Argument : n/a
106 =cut
108 #-------------
109 sub analysis_method {
110 #-------------
111 my ($self, $method) = @_;
112 if($method && ($method !~ /estscan/i)) {
113 $self->throw("method $method not supported in " . ref($self));
115 return $self->SUPER::analysis_method($method);
118 =head2 next_feature
120 Title : next_feature
121 Usage : while($orf = $estscan->next_feature()) {
122 # do something
124 Function: Returns the next gene structure prediction of the ESTScan result
125 file. Call this method repeatedly until FALSE is returned.
127 The returned object is actually a SeqFeatureI implementing object.
128 This method is required for classes implementing the
129 SeqAnalysisParserI interface, and is merely an alias for
130 next_prediction() at present.
132 Example :
133 Returns : A Bio::Tools::Prediction::Gene object.
134 Args :
136 =cut
138 sub next_feature {
139 my ($self,@args) = @_;
140 # even though next_prediction doesn't expect any args (and this method
141 # does neither), we pass on args in order to be prepared if this changes
142 # ever
143 return $self->next_prediction(@args);
146 =head2 next_prediction
148 Title : next_prediction
149 Usage : while($gene = $estscan->next_prediction()) {
150 # do something
152 Function: Returns the next gene structure prediction of the ESTScan result
153 file. Call this method repeatedly until FALSE is returned.
155 So far, this method DOES NOT work for reverse strand predictions,
156 even though the code looks like.
157 Example :
158 Returns : A Bio::Tools::Prediction::Gene object.
159 Args :
161 =cut
163 sub next_prediction {
164 my ($self) = @_;
165 my ($gene, $seq, $cds, $predobj);
166 my $numins = 0;
168 # predictions are in the format of FASTA sequences and can be parsed one
169 # at a time
170 $seq = $self->_fasta_stream()->next_seq();
171 return unless $seq;
172 # there is a new prediction
173 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction",
174 '-source' => "ESTScan");
175 # score starts the description
176 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
177 $self->throw("unexpected format of description: no score in " .
178 $seq->desc());
179 $gene->score($1);
180 $seq->desc($2);
181 # strand may end the description
182 if($seq->desc() =~ /(.*)minus strand$/) {
183 my $desc = $1;
184 $desc =~ s/;\s+$//;
185 $seq->desc($desc);
186 $gene->strand(-1);
187 } else {
188 $gene->strand(1);
190 # check for the format: default or 'all-in-one' (option -a)
191 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
192 # default format
193 $seq->desc($5);
194 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan",
195 '-start' => $3,
196 '-end' => $4);
197 $predobj->strand($gene->strand());
198 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
199 $predobj->primary_tag("InternalExon");
200 $predobj->seq_id($seq->display_id());
201 # add to gene structure object
202 $gene->add_exon($predobj);
203 # add predicted CDS
204 $cds = $seq->seq();
205 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
206 $cds = Bio::PrimarySeq->new('-seq' => $cds,
207 '-display_id' => $seq->display_id(),
208 '-desc' => $seq->desc(),
209 '-alphabet' => "dna");
210 $gene->predicted_cds($cds);
211 $predobj->predicted_cds($cds);
212 if($gene->strand() == -1) {
213 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
215 } else {
217 # All-in-one format (hopefully). This encodes the following information
218 # into the sequence:
219 # 1) untranslated regions: stretches of lower-case letters
220 # 2) translated regions: stretches of upper-case letters
221 # 3) insertions in the translated regions: capital X
222 # 4) deletions in the translated regions: a single lower-case letter
224 # if reverse strand ORF, save a lot of hassle by reversing the sequence
225 if($gene->strand() == -1) {
226 $seq = $seq->revcom();
228 my $seqstr = $seq->seq();
229 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
230 # leading 5'UTR
231 my $utr5 = $1;
232 # exon + 3'UTR
233 my $exonseq = $2;
234 # strip 3'UTR and following exons
235 if($exonseq =~ s/([a-z]{2,}.*)$//) {
236 $seqstr = $1;
237 } else {
238 $seqstr = "";
240 # start: take care of yielding the absolute coordinate
241 my $start = CORE::length($utr5) + 1;
242 if($predobj) {
243 $start += $predobj->end() + $numins;
245 # for the end coordinate, we need to subtract the insertions
246 $cds = $exonseq;
247 $cds =~ s/[X]//g;
248 my $end = $start + CORE::length($cds) - 1;
249 # construct next exon object
250 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start,
251 '-end' => $end);
252 $predobj->source_tag("ESTScan");
253 $predobj->primary_tag("InternalExon");
254 $predobj->seq_id($seq->display_id());
255 $predobj->strand($gene->strand());
256 $predobj->score($gene->score());
257 # add the exon to the gene structure object
258 $gene->add_exon($predobj);
259 # add the predicted CDS
260 $cds = $exonseq;
261 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
262 $cds = Bio::PrimarySeq->new('-seq' => $cds,
263 '-display_id' => $seq->display_id(),
264 '-desc' => $seq->desc(),
265 '-alphabet' => "dna");
266 # only store the first one in the overall prediction
267 $gene->predicted_cds($cds) unless $gene->predicted_cds();
268 $predobj->predicted_cds($cds);
269 # add the predicted insertions and deletions as subfeatures
270 # of the exon
271 my $fea = undef;
272 while($exonseq =~ /([a-zX])/g) {
273 my $indel = $1;
274 # start and end: start looking at the position after the
275 # previous feature
276 if($fea) {
277 $start = $fea->start()+$numins;
278 $start -= 1 if($fea->primary_tag() eq 'insertion');
279 } else {
280 $start = $predobj->start()+$numins-1;
282 #print "# numins = $numins, indel = $indel, start = $start\n";
283 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
284 $fea = Bio::SeqFeature::Generic->new('-start' => $start,
285 '-end' => $start);
286 $fea->source_tag("ESTScan");
287 $fea->seq_id($seq->display_id());
288 $fea->strand($predobj->strand());
289 if($indel eq 'X') {
290 # an insertion (depends on viewpoint: to get the 'real'
291 # CDS, a base has to be inserted, i.e., the HMMER model
292 # inserted a base; however, the sequencing process deleted
293 # a base that was there).
294 $fea->primary_tag("insertion");
295 # we need to count insertions because these are left out
296 # of any coordinates saved in the objects (which is correct
297 # because insertions change the original sequence, so
298 # coordinates wouldn't match)
299 $numins++;
300 } else {
301 # a deletion (depends on viewpoint: to get the 'real'
302 # CDS, a base has to be deleted, i.e., the HMMER model
303 # deleted a base; however, the sequencing process inserted
304 # a base that wasn't there).
305 $fea->primary_tag("deletion");
306 $fea->add_tag_value('base', $indel);
308 $predobj->add_sub_SeqFeature($fea);
313 return $gene;
316 =head2 close
318 Title : close
319 Usage : $result->close()
320 Function: Closes the file handle associated with this result file.
321 Inherited method, overridden.
322 Example :
323 Returns :
324 Args :
326 =cut
328 sub close {
329 my ($self, @args) = @_;
331 delete($self->{'_fastastream'});
332 $self->SUPER::close(@args);
335 =head2 _fasta_stream
337 Title : _fasta_stream
338 Usage : $result->_fasta_stream()
339 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
340 the file associated with this MZEF result object.
342 If called for the first time, creates the stream from the filehandle
343 if necessary.
344 Example :
345 Returns :
346 Args :
348 =cut
350 sub _fasta_stream {
351 my ($self, $stream) = @_;
353 if($stream || (! exists($self->{'_fastastream'}))) {
354 if(! $stream) {
355 $stream = Bio::SeqIO->new('-fh' => $self->_fh(),
356 '-format' => "fasta");
358 $self->{'_fastastream'} = $stream;
360 return $self->{'_fastastream'};