tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Tools / ESTScan.pm
blobf37e6cf59c373ac42134ac1d241daee137e62fa7
1 # $Id$
3 # BioPerl module for Bio::Tools::ESTScan
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Hilmar Lapp <hlapp@gmx.net>
9 # Copyright Hilmar Lapp
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::ESTScan - Results of one ESTScan run
19 =head1 SYNOPSIS
21 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
22 # filehandle:
23 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
25 # parse the results
26 # note: this class is-a Bio::Tools::AnalysisResult which implements
27 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
28 while($gene = $estscan->next_prediction()) {
29 # $gene is an instance of Bio::Tools::Prediction::Gene
30 foreach my $orf ($gene->exons()) {
31 # $orf is an instance of Bio::Tools::Prediction::Exon
32 $cds_str = $orf->predicted_cds();
36 # essential if you gave a filename at initialization (otherwise the file
37 # will stay open)
38 $estscan->close();
40 =head1 DESCRIPTION
42 The ESTScan module provides a parser for ESTScan coding region prediction
43 output.
45 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
46 implements the L<Bio::SeqAnalysisParserI> interface.
47 See L<Bio::SeqAnalysisParserI>.
49 =head1 FEEDBACK
51 =head2 Mailing Lists
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to one
55 of the Bioperl mailing lists. Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 =head2 Support
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
71 =head2 Reporting Bugs
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via the
75 web:
77 http://bugzilla.open-bio.org/
79 =head1 AUTHOR - Hilmar Lapp
81 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
83 =head1 APPENDIX
85 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
87 =cut
89 # Let the code begin...
91 package Bio::Tools::ESTScan;
92 use strict;
93 use Symbol;
95 use Bio::Root::Root;
96 use Bio::Tools::Prediction::Exon;
98 use base qw(Bio::Tools::AnalysisResult);
100 sub _initialize_state {
101 my ($self,@args) = @_;
103 # first call the inherited method!
104 my $make = $self->SUPER::_initialize_state(@args);
106 if(! $self->analysis_method()) {
107 $self->analysis_method('ESTScan');
111 =head2 analysis_method
113 Usage : $estscan->analysis_method();
114 Purpose : Inherited method. Overridden to ensure that the name matches
115 /estscan/i.
116 Returns : String
117 Argument : n/a
119 =cut
121 #-------------
122 sub analysis_method {
123 #-------------
124 my ($self, $method) = @_;
125 if($method && ($method !~ /estscan/i)) {
126 $self->throw("method $method not supported in " . ref($self));
128 return $self->SUPER::analysis_method($method);
131 =head2 next_feature
133 Title : next_feature
134 Usage : while($orf = $estscan->next_feature()) {
135 # do something
137 Function: Returns the next gene structure prediction of the ESTScan result
138 file. Call this method repeatedly until FALSE is returned.
140 The returned object is actually a SeqFeatureI implementing object.
141 This method is required for classes implementing the
142 SeqAnalysisParserI interface, and is merely an alias for
143 next_prediction() at present.
145 Example :
146 Returns : A Bio::Tools::Prediction::Gene object.
147 Args :
149 =cut
151 sub next_feature {
152 my ($self,@args) = @_;
153 # even though next_prediction doesn't expect any args (and this method
154 # does neither), we pass on args in order to be prepared if this changes
155 # ever
156 return $self->next_prediction(@args);
159 =head2 next_prediction
161 Title : next_prediction
162 Usage : while($gene = $estscan->next_prediction()) {
163 # do something
165 Function: Returns the next gene structure prediction of the ESTScan result
166 file. Call this method repeatedly until FALSE is returned.
168 So far, this method DOES NOT work for reverse strand predictions,
169 even though the code looks like.
170 Example :
171 Returns : A Bio::Tools::Prediction::Gene object.
172 Args :
174 =cut
176 sub next_prediction {
177 my ($self) = @_;
178 my ($gene, $seq, $cds, $predobj);
179 my $numins = 0;
181 # predictions are in the format of FASTA sequences and can be parsed one
182 # at a time
183 $seq = $self->_fasta_stream()->next_seq();
184 return unless $seq;
185 # there is a new prediction
186 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction",
187 '-source' => "ESTScan");
188 # score starts the description
189 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
190 $self->throw("unexpected format of description: no score in " .
191 $seq->desc());
192 $gene->score($1);
193 $seq->desc($2);
194 # strand may end the description
195 if($seq->desc() =~ /(.*)minus strand$/) {
196 my $desc = $1;
197 $desc =~ s/;\s+$//;
198 $seq->desc($desc);
199 $gene->strand(-1);
200 } else {
201 $gene->strand(1);
203 # check for the format: default or 'all-in-one' (option -a)
204 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
205 # default format
206 $seq->desc($5);
207 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan",
208 '-start' => $3,
209 '-end' => $4);
210 $predobj->strand($gene->strand());
211 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
212 $predobj->primary_tag("InternalExon");
213 $predobj->seq_id($seq->display_id());
214 # add to gene structure object
215 $gene->add_exon($predobj);
216 # add predicted CDS
217 $cds = $seq->seq();
218 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
219 $cds = Bio::PrimarySeq->new('-seq' => $cds,
220 '-display_id' => $seq->display_id(),
221 '-desc' => $seq->desc(),
222 '-alphabet' => "dna");
223 $gene->predicted_cds($cds);
224 $predobj->predicted_cds($cds);
225 if($gene->strand() == -1) {
226 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
228 } else {
230 # All-in-one format (hopefully). This encodes the following information
231 # into the sequence:
232 # 1) untranslated regions: stretches of lower-case letters
233 # 2) translated regions: stretches of upper-case letters
234 # 3) insertions in the translated regions: capital X
235 # 4) deletions in the translated regions: a single lower-case letter
237 # if reverse strand ORF, save a lot of hassle by reversing the sequence
238 if($gene->strand() == -1) {
239 $seq = $seq->revcom();
241 my $seqstr = $seq->seq();
242 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
243 # leading 5'UTR
244 my $utr5 = $1;
245 # exon + 3'UTR
246 my $exonseq = $2;
247 # strip 3'UTR and following exons
248 if($exonseq =~ s/([a-z]{2,}.*)$//) {
249 $seqstr = $1;
250 } else {
251 $seqstr = "";
253 # start: take care of yielding the absolute coordinate
254 my $start = CORE::length($utr5) + 1;
255 if($predobj) {
256 $start += $predobj->end() + $numins;
258 # for the end coordinate, we need to subtract the insertions
259 $cds = $exonseq;
260 $cds =~ s/[X]//g;
261 my $end = $start + CORE::length($cds) - 1;
262 # construct next exon object
263 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start,
264 '-end' => $end);
265 $predobj->source_tag("ESTScan");
266 $predobj->primary_tag("InternalExon");
267 $predobj->seq_id($seq->display_id());
268 $predobj->strand($gene->strand());
269 $predobj->score($gene->score());
270 # add the exon to the gene structure object
271 $gene->add_exon($predobj);
272 # add the predicted CDS
273 $cds = $exonseq;
274 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
275 $cds = Bio::PrimarySeq->new('-seq' => $cds,
276 '-display_id' => $seq->display_id(),
277 '-desc' => $seq->desc(),
278 '-alphabet' => "dna");
279 # only store the first one in the overall prediction
280 $gene->predicted_cds($cds) unless $gene->predicted_cds();
281 $predobj->predicted_cds($cds);
282 # add the predicted insertions and deletions as subfeatures
283 # of the exon
284 my $fea = undef;
285 while($exonseq =~ /([a-zX])/g) {
286 my $indel = $1;
287 # start and end: start looking at the position after the
288 # previous feature
289 if($fea) {
290 $start = $fea->start()+$numins;
291 $start -= 1 if($fea->primary_tag() eq 'insertion');
292 } else {
293 $start = $predobj->start()+$numins-1;
295 #print "# numins = $numins, indel = $indel, start = $start\n";
296 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
297 $fea = Bio::SeqFeature::Generic->new('-start' => $start,
298 '-end' => $start);
299 $fea->source_tag("ESTScan");
300 $fea->seq_id($seq->display_id());
301 $fea->strand($predobj->strand());
302 if($indel eq 'X') {
303 # an insertion (depends on viewpoint: to get the 'real'
304 # CDS, a base has to be inserted, i.e., the HMMER model
305 # inserted a base; however, the sequencing process deleted
306 # a base that was there).
307 $fea->primary_tag("insertion");
308 # we need to count insertions because these are left out
309 # of any coordinates saved in the objects (which is correct
310 # because insertions change the original sequence, so
311 # coordinates wouldn't match)
312 $numins++;
313 } else {
314 # a deletion (depends on viewpoint: to get the 'real'
315 # CDS, a base has to be deleted, i.e., the HMMER model
316 # deleted a base; however, the sequencing process inserted
317 # a base that wasn't there).
318 $fea->primary_tag("deletion");
319 $fea->add_tag_value('base', $indel);
321 $predobj->add_sub_SeqFeature($fea);
326 return $gene;
329 =head2 close
331 Title : close
332 Usage : $result->close()
333 Function: Closes the file handle associated with this result file.
334 Inherited method, overridden.
335 Example :
336 Returns :
337 Args :
339 =cut
341 sub close {
342 my ($self, @args) = @_;
344 delete($self->{'_fastastream'});
345 $self->SUPER::close(@args);
348 =head2 _fasta_stream
350 Title : _fasta_stream
351 Usage : $result->_fasta_stream()
352 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
353 the file associated with this MZEF result object.
355 If called for the first time, creates the stream from the filehandle
356 if necessary.
357 Example :
358 Returns :
359 Args :
361 =cut
363 sub _fasta_stream {
364 my ($self, $stream) = @_;
366 if($stream || (! exists($self->{'_fastastream'}))) {
367 if(! $stream) {
368 $stream = Bio::SeqIO->new('-fh' => $self->_fh(),
369 '-format' => "fasta");
371 $self->{'_fastastream'} = $stream;
373 return $self->{'_fastastream'};