sync trunk with branch
[bioperl-live.git] / Bio / Tools / ERPIN.pm
blob11a8698e59a9ce748200430aa45fd42267b4bd0b
1 # $Id$
3 # BioPerl module for Bio::Tools::ERPIN
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::ERPIN - a parser for ERPIN output
17 =head1 SYNOPSIS
19 use Bio::Tools::ERPIN;
20 my $parser = Bio::Tools::ERPIN->new( -file => $rna_output,
21 -motiftag => 'protein_bind'
22 -desctag => 'TRAP_binding');
23 #parse the results
24 while( my $motif = $parser->next_prediction) {
25 # do something here
28 =head1 DESCRIPTION
30 Parses raw ERPIN output.
32 This module is not currently complete. As is, it will parse raw
33 ERPIN long format output and pack information into
34 Bio::SeqFeature::Generic objects.
36 Several values have also been added in the 'tag' hash. These can be
37 accessed using the following syntax:
39 my ($entry) = $feature->get_Annotations('SecStructure');
41 Added tags are :
42 tset - training set used for the sequence
43 tsetdesc - training set description line
44 cutoff - cutoff value used
45 database - name of database
46 dbdesc - description of database
47 dbratios - nucleotide ratios of database (used to calculate evalue)
48 descline - entire description line (in case the regex used for
49 sequence ID doesn't adequately catch the name
50 accession - accession number of sequence (if present)
51 logodds - logodds score value
52 sequence - sequence from hit, separated based on training set
54 See t/ERPIN.t for example usage.
56 At some point a more complicated feature object may be used to support
57 this data rather than forcing most of the information into tag/value
58 pairs in a SeqFeature::Generic. This will hopefully allow for more
59 flexible analysis of data (specifically RNA secondary structural
60 data).
62 =head1 FEEDBACK
64 =head2 Mailing Lists
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to
68 the Bioperl mailing list. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 =head2 Reporting Bugs
75 Report bugs to the Bioperl bug tracking system to help us keep track
76 of the bugs and their resolution. Bug reports can be submitted via the
77 web:
79 http://bugzilla.open-bio.org/
81 =head1 AUTHOR - Chris Fields
83 Email cjfields-at-uiuc-dot-edu
85 =head1 APPENDIX
87 The rest of the documentation details each of the object methods.
88 Internal methods are usually preceded with a _
90 =cut
92 # Let the code begin...
94 package Bio::Tools::ERPIN;
95 use strict;
97 use Bio::SeqFeature::Generic;
99 use base qw(Bio::Tools::AnalysisResult);
101 our($MotifTag,$SrcTag,$DescTag) = qw(misc_binding ERPIN erpin);
103 =head2 new
105 Title : new
106 Usage : my $obj = Bio::Tools::ERPIN->new();
107 Function: Builds a new Bio::Tools::ERPIN object
108 Returns : an instance of Bio::Tools::ERPIN
109 Args : -fh/-file for input filename
110 -motiftag => primary tag used in gene features (default 'misc_binding')
111 -desctag => tag used for display_name name (default 'erpin')
112 -srctag => source tag used in all features (default 'ERPIN')
114 =cut
116 sub _initialize {
117 my($self,@args) = @_;
118 $self->warn('Use of this module is deprecated. Use Bio::SearchIO::erpin instead');
119 $self->SUPER::_initialize(@args);
120 my ($motiftag,$desctag,$srctag) = $self->SUPER::_rearrange([qw(MOTIFTAG
121 DESCTAG
122 SRCTAG
124 @args);
125 $self->motif_tag(defined $motiftag ? $motiftag : $MotifTag);
126 $self->source_tag(defined $srctag ? $srctag : $SrcTag);
127 $self->desc_tag(defined $desctag ? $desctag : $DescTag);
128 foreach (qw(_tset _tset_desc _cutoff _db _db_desc
129 _db_ratios _eval_cutoff _seqid _secacc _seqdesc )) {
130 $self->{$_}='';
134 =head2 motif_tag
136 Title : motiftag
137 Usage : $obj->motiftag($newval)
138 Function: Get/Set the value used for 'motif_tag', which is used for setting the
139 primary_tag.
140 Default is 'misc_binding' as set by the global $MotifTag.
141 'misc_binding' is used here because a conserved RNA motif is capable
142 of binding proteins (regulatory proteins), antisense RNA (siRNA),
143 small molecules (riboswitches), or nothing at all (tRNA,
144 terminators, etc.). It is recommended that this be changed to other
145 tags ('misc_RNA', 'protein_binding', 'tRNA', etc.) where appropriate.
146 For more information, see:
147 http://www.ncbi.nlm.nih.gov/collab/FT/index.html
148 Returns : value of motif_tag (a scalar)
149 Args : on set, new value (a scalar or undef, optional)
151 =cut
153 sub motif_tag{
154 my $self = shift;
156 return $self->{'motif_tag'} = shift if @_;
157 return $self->{'motif_tag'};
160 =head2 source_tag
162 Title : source_tag
163 Usage : $obj->source_tag($newval)
164 Function: Get/Set the value used for the 'source_tag'.
165 Default is 'ERPIN' as set by the global $SrcTag
166 Returns : value of source_tag (a scalar)
167 Args : on set, new value (a scalar or undef, optional)
170 =cut
172 sub source_tag{
173 my $self = shift;
175 return $self->{'source_tag'} = shift if @_;
176 return $self->{'source_tag'};
179 =head2 desc_tag
181 Title : desc_tag
182 Usage : $obj->desc_tag($newval)
183 Function: Get/Set the value used for the query motif. This will be placed in
184 the tag '-display_name'. Default is 'erpin' as set by the global
185 $DescTag. Use this to manually set the descriptor (motif searched for).
186 Since there is no way for this module to tell what the motif is from the
187 name of the descriptor file or the ERPIN output, this should
188 be set every time an ERPIN object is instantiated for clarity
189 Returns : value of exon_tag (a scalar)
190 Args : on set, new value (a scalar or undef, optional)
192 =cut
194 sub desc_tag{
195 my $self = shift;
197 return $self->{'desc_tag'} = shift if @_;
198 return $self->{'desc_tag'};
201 =head2 analysis_method
203 Usage : $obj->analysis_method();
204 Purpose : Inherited method. Overridden to ensure that the name matches
205 /ERPIN/i.
206 Returns : String
207 Argument : n/a
209 =cut
211 #-------------
212 sub analysis_method {
213 #-------------
214 my ($self, $method) = @_;
215 if($method && ($method !~ /ERPIN/i)) {
216 $self->throw("method $method not supported in " . ref($self));
218 return $self->SUPER::analysis_method($method);
221 =head2 next_feature
223 Title : next_feature
224 Usage : while($gene = $obj->next_feature()) {
225 # do something
227 Function: Returns the next gene structure prediction of the ERPIN result
228 file. Call this method repeatedly until FALSE is returned.
229 The returned object is actually a SeqFeatureI implementing object.
230 This method is required for classes implementing the
231 SeqAnalysisParserI interface, and is merely an alias for
232 next_prediction() at present.
233 Returns : A Bio::Tools::Prediction::Gene object.
234 Args : None (at present)
236 =cut
238 sub next_feature {
239 my ($self,@args) = @_;
240 # even though next_prediction doesn't expect any args (and this method
241 # does neither), we pass on args in order to be prepared if this changes
242 # ever
243 return $self->next_prediction(@args);
246 =head2 next_prediction
248 Title : next_prediction
249 Usage : while($gene = $obj->next_prediction()) {
250 # do something
252 Function: Returns the next gene structure prediction of the ERPIN result
253 file. Call this method repeatedly until FALSE is returned.
254 Returns : A Bio::Tools::Prediction::Gene object.
255 Args : None (at present)
257 =cut
259 sub next_prediction {
260 my ($self) = @_;
261 my ($motiftag,$srctag,$desctag) = ( $self->motif_tag,
262 $self->source_tag,
263 $self->desc_tag);
264 # hit vars
265 my ($strand, $start, $end, $sequence, $logodds, $score)=0;
266 while($_ = $self->_readline) {
267 #skip blank lines
268 next if /^\s+$/;
269 # parse header; there's probably a better way to do this, perhaps by
270 # mapping, but this works for now...
271 if(/^Training set:\s+\"(.*)\":$/) {
272 $self->{'_tset'}=$1;
274 elsif(/\s+(\d+ sequences of length \d+)/){
275 $self->{'_tset_descr'}=$1;
277 elsif(/^Cutoff:\s+(\S+)\s+$/) {
278 $self->{'_cutoff'}=$1;
280 elsif(/^Database:\s+\"(.*)\"$/) {
281 $self->{'_db'}=$1;
283 elsif(/^\s+(\d+ nucleotides to be processed in \d+ sequence)$/) {
284 $self->{'_db_desc'}=$1;
286 elsif(/^\s+ATGC ratios:\s(\d.\d+)\s+(\d.\d+)\s+(\d.\d+)\s+(\d.\d+)$/) {
287 my $atgc=sprintf("A=%0.3f T=%0.3f G=%0.3f C=%0.3f", $1, $2, $3, $4);
288 $self->{'_db_ratios'}=$atgc;
290 elsif(/^E-value at cutoff \S+ for \S+(?:G|M|k)?b double strand data: (\S+)/) {
291 $self->{'_eval_cutoff'}=$1;
293 # catch hit, store in private hash keys
294 elsif (/^>(.*)/) {
295 $self->{_seq_desc} = $1;
296 if($self->{_seq_desc} =~
297 /(?:P<db>gb|gi|emb|dbj|sp|pdb|bbs|ref|lcl)\|(\d+)((?:\:|\|)\w+\|(\S*.\d+)\|)?/) {
298 $self->{_seqid} = $1; # pulls out gid
299 $self->{_seq_acc} = $3;
300 } else {
301 $self->{_seqid} = $self->{_seq_desc};
302 $self->{_seq_acc} = '';
305 # parse next hit
306 elsif (/^(FW|RC)\s+\d+\s+(\d+)..(\d+)\s+(\d+.\d+)\s+(.*)/) {
307 ($strand, $start, $end, $logodds, $score)=($1, $2, $3, $4, $5);
308 $score =~ s/^e/1e/i;
309 chomp ($sequence = $self->_readline); # grab next line, which is the sequence hit
310 my $gene = Bio::SeqFeature::Generic->new(-seq_id => $self->{_seqid},
311 -start => $start,
312 -end => $end,
313 -strand => $strand eq 'FW' ? 1 : -1,
314 -score => $score,
315 -primary_tag => $motiftag,
316 -source_tag => $srctag,
317 -display_name => $desctag,
318 -tag => {
319 'tset' => $self->{_tset},
320 'tsetdesc' => $self->{_tset_descr},
321 'cutoff' => $self->{_cutoff},
322 'database' => $self->{_db},
323 'dbdesc' => $self->{_db_desc},
324 'dbratios' => $self->{_db_ratios},
325 'descline' => $self->{_seq_desc},
326 'accession' => $self->{_seq_acc},
327 'logodds' => $logodds,
328 'sequence' => $sequence}
330 return $gene;
332 #else {
333 # $self->debug("unrecognized line: $_");