3 # BioPerl module for Bio::Tools::ESTScan
5 # Cared for by Hilmar Lapp <hlapp@gmx.net>
7 # Copyright Hilmar Lapp
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::ESTScan - Results of one ESTScan run
19 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan');
21 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT );
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
26 while($gene = $estscan->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene
28 foreach my $orf ($gene->exons()) {
29 # $orf is an instance of Bio::Tools::Prediction::Exon
30 $cds_str = $orf->predicted_cds();
34 # essential if you gave a filename at initialization (otherwise the file
40 The ESTScan module provides a parser for ESTScan coding region prediction
43 This module inherits off L<Bio::Tools::AnalysisResult> and therefore
44 implements the L<Bio::SeqAnalysisParserI> interface.
45 See L<Bio::SeqAnalysisParserI>.
51 User feedback is an integral part of the evolution of this and other
52 Bioperl modules. Send your comments and suggestions preferably to one
53 of the Bioperl mailing lists. Your participation is much appreciated.
55 bioperl-l@bioperl.org - General discussion
56 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 the bugs and their resolution. Bug reports can be submitted via the
64 http://bugzilla.open-bio.org/
66 =head1 AUTHOR - Hilmar Lapp
68 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com)
72 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
76 # Let the code begin...
78 package Bio
::Tools
::ESTScan
;
83 use Bio
::Tools
::Prediction
::Exon
;
85 use base
qw(Bio::Tools::AnalysisResult);
87 sub _initialize_state
{
88 my ($self,@args) = @_;
90 # first call the inherited method!
91 my $make = $self->SUPER::_initialize_state
(@args);
93 if(! $self->analysis_method()) {
94 $self->analysis_method('ESTScan');
98 =head2 analysis_method
100 Usage : $estscan->analysis_method();
101 Purpose : Inherited method. Overridden to ensure that the name matches
109 sub analysis_method
{
111 my ($self, $method) = @_;
112 if($method && ($method !~ /estscan/i)) {
113 $self->throw("method $method not supported in " . ref($self));
115 return $self->SUPER::analysis_method
($method);
121 Usage : while($orf = $estscan->next_feature()) {
124 Function: Returns the next gene structure prediction of the ESTScan result
125 file. Call this method repeatedly until FALSE is returned.
127 The returned object is actually a SeqFeatureI implementing object.
128 This method is required for classes implementing the
129 SeqAnalysisParserI interface, and is merely an alias for
130 next_prediction() at present.
133 Returns : A Bio::Tools::Prediction::Gene object.
139 my ($self,@args) = @_;
140 # even though next_prediction doesn't expect any args (and this method
141 # does neither), we pass on args in order to be prepared if this changes
143 return $self->next_prediction(@args);
146 =head2 next_prediction
148 Title : next_prediction
149 Usage : while($gene = $estscan->next_prediction()) {
152 Function: Returns the next gene structure prediction of the ESTScan result
153 file. Call this method repeatedly until FALSE is returned.
155 So far, this method DOES NOT work for reverse strand predictions,
156 even though the code looks like.
158 Returns : A Bio::Tools::Prediction::Gene object.
163 sub next_prediction
{
165 my ($gene, $seq, $cds, $predobj);
168 # predictions are in the format of FASTA sequences and can be parsed one
170 $seq = $self->_fasta_stream()->next_seq();
172 # there is a new prediction
173 $gene = Bio
::Tools
::Prediction
::Gene
->new('-primary' => "ORFprediction",
174 '-source' => "ESTScan");
175 # score starts the description
176 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or
177 $self->throw("unexpected format of description: no score in " .
181 # strand may end the description
182 if($seq->desc() =~ /(.*)minus strand$/) {
190 # check for the format: default or 'all-in-one' (option -a)
191 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) {
194 $predobj = Bio
::Tools
::Prediction
::Exon
->new('-source' => "ESTScan",
197 $predobj->strand($gene->strand());
198 $predobj->score($gene->score()); # FIXME or $1, or $2 ?
199 $predobj->primary_tag("InternalExon");
200 $predobj->seq_id($seq->display_id());
201 # add to gene structure object
202 $gene->add_exon($predobj);
205 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
206 $cds = Bio
::PrimarySeq
->new('-seq' => $cds,
207 '-display_id' => $seq->display_id(),
208 '-desc' => $seq->desc(),
209 '-alphabet' => "dna");
210 $gene->predicted_cds($cds);
211 $predobj->predicted_cds($cds);
212 if($gene->strand() == -1) {
213 $self->warn("reverse strand ORF, but unable to reverse coordinates!");
217 # All-in-one format (hopefully). This encodes the following information
219 # 1) untranslated regions: stretches of lower-case letters
220 # 2) translated regions: stretches of upper-case letters
221 # 3) insertions in the translated regions: capital X
222 # 4) deletions in the translated regions: a single lower-case letter
224 # if reverse strand ORF, save a lot of hassle by reversing the sequence
225 if($gene->strand() == -1) {
226 $seq = $seq->revcom();
228 my $seqstr = $seq->seq();
229 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) {
234 # strip 3'UTR and following exons
235 if($exonseq =~ s/([a-z]{2,}.*)$//) {
240 # start: take care of yielding the absolute coordinate
241 my $start = CORE
::length($utr5) + 1;
243 $start += $predobj->end() + $numins;
245 # for the end coordinate, we need to subtract the insertions
248 my $end = $start + CORE
::length($cds) - 1;
249 # construct next exon object
250 $predobj = Bio
::Tools
::Prediction
::Exon
->new('-start' => $start,
252 $predobj->source_tag("ESTScan");
253 $predobj->primary_tag("InternalExon");
254 $predobj->seq_id($seq->display_id());
255 $predobj->strand($gene->strand());
256 $predobj->score($gene->score());
257 # add the exon to the gene structure object
258 $gene->add_exon($predobj);
259 # add the predicted CDS
261 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions
262 $cds = Bio
::PrimarySeq
->new('-seq' => $cds,
263 '-display_id' => $seq->display_id(),
264 '-desc' => $seq->desc(),
265 '-alphabet' => "dna");
266 # only store the first one in the overall prediction
267 $gene->predicted_cds($cds) unless $gene->predicted_cds();
268 $predobj->predicted_cds($cds);
269 # add the predicted insertions and deletions as subfeatures
272 while($exonseq =~ /([a-zX])/g) {
274 # start and end: start looking at the position after the
277 $start = $fea->start()+$numins;
278 $start -= 1 if($fea->primary_tag() eq 'insertion');
280 $start = $predobj->start()+$numins-1;
282 #print "# numins = $numins, indel = $indel, start = $start\n";
283 $start = index($seq->seq(), $indel, $start) + 1 - $numins;
284 $fea = Bio
::SeqFeature
::Generic
->new('-start' => $start,
286 $fea->source_tag("ESTScan");
287 $fea->seq_id($seq->display_id());
288 $fea->strand($predobj->strand());
290 # an insertion (depends on viewpoint: to get the 'real'
291 # CDS, a base has to be inserted, i.e., the HMMER model
292 # inserted a base; however, the sequencing process deleted
293 # a base that was there).
294 $fea->primary_tag("insertion");
295 # we need to count insertions because these are left out
296 # of any coordinates saved in the objects (which is correct
297 # because insertions change the original sequence, so
298 # coordinates wouldn't match)
301 # a deletion (depends on viewpoint: to get the 'real'
302 # CDS, a base has to be deleted, i.e., the HMMER model
303 # deleted a base; however, the sequencing process inserted
304 # a base that wasn't there).
305 $fea->primary_tag("deletion");
306 $fea->add_tag_value('base', $indel);
308 $predobj->add_sub_SeqFeature($fea);
319 Usage : $result->close()
320 Function: Closes the file handle associated with this result file.
321 Inherited method, overridden.
329 my ($self, @args) = @_;
331 delete($self->{'_fastastream'});
332 $self->SUPER::close(@args);
337 Title : _fasta_stream
338 Usage : $result->_fasta_stream()
339 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of
340 the file associated with this MZEF result object.
342 If called for the first time, creates the stream from the filehandle
351 my ($self, $stream) = @_;
353 if($stream || (! exists($self->{'_fastastream'}))) {
355 $stream = Bio
::SeqIO
->new('-fh' => $self->_fh(),
356 '-format' => "fasta");
358 $self->{'_fastastream'} = $stream;
360 return $self->{'_fastastream'};