3 # BioPerl module for Bio::Tools::ERPIN
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tools::ERPIN - a parser for ERPIN output
19 use Bio::Tools::ERPIN;
20 my $parser = Bio::Tools::ERPIN->new( -file => $rna_output,
21 -motiftag => 'protein_bind'
22 -desctag => 'TRAP_binding');
24 while( my $motif = $parser->next_prediction) {
30 Parses raw ERPIN output.
32 This module is not currently complete. As is, it will parse raw
33 ERPIN long format output and pack information into
34 Bio::SeqFeature::Generic objects.
36 Several values have also been added in the 'tag' hash. These can be
37 accessed using the following syntax:
39 my ($entry) = $feature->get_Annotations('SecStructure');
42 tset - training set used for the sequence
43 tsetdesc - training set description line
44 cutoff - cutoff value used
45 database - name of database
46 dbdesc - description of database
47 dbratios - nucleotide ratios of database (used to calculate evalue)
48 descline - entire description line (in case the regex used for
49 sequence ID doesn't adequately catch the name
50 accession - accession number of sequence (if present)
51 logodds - logodds score value
52 sequence - sequence from hit, separated based on training set
54 See t/ERPIN.t for example usage.
56 At some point a more complicated feature object may be used to support
57 this data rather than forcing most of the information into tag/value
58 pairs in a SeqFeature::Generic. This will hopefully allow for more
59 flexible analysis of data (specifically RNA secondary structural
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to
68 the Bioperl mailing list. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 Report bugs to the Bioperl bug tracking system to help us keep track
76 of the bugs and their resolution. Bug reports can be submitted via the
79 http://bugzilla.open-bio.org/
81 =head1 AUTHOR - Chris Fields
83 Email cjfields-at-uiuc-dot-edu
87 The rest of the documentation details each of the object methods.
88 Internal methods are usually preceded with a _
92 # Let the code begin...
94 package Bio
::Tools
::ERPIN
;
97 use Bio
::SeqFeature
::Generic
;
99 use base
qw(Bio::Tools::AnalysisResult);
101 our($MotifTag,$SrcTag,$DescTag) = qw(misc_binding ERPIN erpin);
106 Usage : my $obj = Bio::Tools::ERPIN->new();
107 Function: Builds a new Bio::Tools::ERPIN object
108 Returns : an instance of Bio::Tools::ERPIN
109 Args : -fh/-file for input filename
110 -motiftag => primary tag used in gene features (default 'misc_binding')
111 -desctag => tag used for display_name name (default 'erpin')
112 -srctag => source tag used in all features (default 'ERPIN')
117 my($self,@args) = @_;
118 $self->warn('Use of this module is deprecated. Use Bio::SearchIO::erpin instead');
119 $self->SUPER::_initialize
(@args);
120 my ($motiftag,$desctag,$srctag) = $self->SUPER::_rearrange
([qw(MOTIFTAG
125 $self->motif_tag(defined $motiftag ?
$motiftag : $MotifTag);
126 $self->source_tag(defined $srctag ?
$srctag : $SrcTag);
127 $self->desc_tag(defined $desctag ?
$desctag : $DescTag);
128 foreach (qw(_tset _tset_desc _cutoff _db _db_desc
129 _db_ratios _eval_cutoff _seqid _secacc _seqdesc )) {
137 Usage : $obj->motiftag($newval)
138 Function: Get/Set the value used for 'motif_tag', which is used for setting the
140 Default is 'misc_binding' as set by the global $MotifTag.
141 'misc_binding' is used here because a conserved RNA motif is capable
142 of binding proteins (regulatory proteins), antisense RNA (siRNA),
143 small molecules (riboswitches), or nothing at all (tRNA,
144 terminators, etc.). It is recommended that this be changed to other
145 tags ('misc_RNA', 'protein_binding', 'tRNA', etc.) where appropriate.
146 For more information, see:
147 http://www.ncbi.nlm.nih.gov/collab/FT/index.html
148 Returns : value of motif_tag (a scalar)
149 Args : on set, new value (a scalar or undef, optional)
156 return $self->{'motif_tag'} = shift if @_;
157 return $self->{'motif_tag'};
163 Usage : $obj->source_tag($newval)
164 Function: Get/Set the value used for the 'source_tag'.
165 Default is 'ERPIN' as set by the global $SrcTag
166 Returns : value of source_tag (a scalar)
167 Args : on set, new value (a scalar or undef, optional)
175 return $self->{'source_tag'} = shift if @_;
176 return $self->{'source_tag'};
182 Usage : $obj->desc_tag($newval)
183 Function: Get/Set the value used for the query motif. This will be placed in
184 the tag '-display_name'. Default is 'erpin' as set by the global
185 $DescTag. Use this to manually set the descriptor (motif searched for).
186 Since there is no way for this module to tell what the motif is from the
187 name of the descriptor file or the ERPIN output, this should
188 be set every time an ERPIN object is instantiated for clarity
189 Returns : value of exon_tag (a scalar)
190 Args : on set, new value (a scalar or undef, optional)
197 return $self->{'desc_tag'} = shift if @_;
198 return $self->{'desc_tag'};
201 =head2 analysis_method
203 Usage : $obj->analysis_method();
204 Purpose : Inherited method. Overridden to ensure that the name matches
212 sub analysis_method
{
214 my ($self, $method) = @_;
215 if($method && ($method !~ /ERPIN/i)) {
216 $self->throw("method $method not supported in " . ref($self));
218 return $self->SUPER::analysis_method
($method);
224 Usage : while($gene = $obj->next_feature()) {
227 Function: Returns the next gene structure prediction of the ERPIN result
228 file. Call this method repeatedly until FALSE is returned.
229 The returned object is actually a SeqFeatureI implementing object.
230 This method is required for classes implementing the
231 SeqAnalysisParserI interface, and is merely an alias for
232 next_prediction() at present.
233 Returns : A Bio::Tools::Prediction::Gene object.
234 Args : None (at present)
239 my ($self,@args) = @_;
240 # even though next_prediction doesn't expect any args (and this method
241 # does neither), we pass on args in order to be prepared if this changes
243 return $self->next_prediction(@args);
246 =head2 next_prediction
248 Title : next_prediction
249 Usage : while($gene = $obj->next_prediction()) {
252 Function: Returns the next gene structure prediction of the ERPIN result
253 file. Call this method repeatedly until FALSE is returned.
254 Returns : A Bio::Tools::Prediction::Gene object.
255 Args : None (at present)
259 sub next_prediction
{
261 my ($motiftag,$srctag,$desctag) = ( $self->motif_tag,
265 my ($strand, $start, $end, $sequence, $logodds, $score)=0;
266 while($_ = $self->_readline) {
269 # parse header; there's probably a better way to do this, perhaps by
270 # mapping, but this works for now...
271 if(/^Training set:\s+\"(.*)\":$/) {
274 elsif(/\s+(\d+ sequences of length \d+)/){
275 $self->{'_tset_descr'}=$1;
277 elsif(/^Cutoff:\s+(\S+)\s+$/) {
278 $self->{'_cutoff'}=$1;
280 elsif(/^Database:\s+\"(.*)\"$/) {
283 elsif(/^\s+(\d+ nucleotides to be processed in \d+ sequence)$/) {
284 $self->{'_db_desc'}=$1;
286 elsif(/^\s+ATGC ratios:\s(\d.\d+)\s+(\d.\d+)\s+(\d.\d+)\s+(\d.\d+)$/) {
287 my $atgc=sprintf("A=%0.3f T=%0.3f G=%0.3f C=%0.3f", $1, $2, $3, $4);
288 $self->{'_db_ratios'}=$atgc;
290 elsif(/^E-value at cutoff \S+ for \S+(?:G|M|k)?b double strand data: (\S+)/) {
291 $self->{'_eval_cutoff'}=$1;
293 # catch hit, store in private hash keys
295 $self->{_seq_desc
} = $1;
296 if($self->{_seq_desc
} =~
297 /(?:P<db>gb|gi|emb|dbj|sp|pdb|bbs|ref|lcl)\|(\d+)((?:\:|\|)\w+\|(\S*.\d+)\|)?/) {
298 $self->{_seqid
} = $1; # pulls out gid
299 $self->{_seq_acc
} = $3;
301 $self->{_seqid
} = $self->{_seq_desc
};
302 $self->{_seq_acc
} = '';
306 elsif (/^(FW|RC)\s+\d+\s+(\d+)..(\d+)\s+(\d+.\d+)\s+(.*)/) {
307 ($strand, $start, $end, $logodds, $score)=($1, $2, $3, $4, $5);
309 chomp ($sequence = $self->_readline); # grab next line, which is the sequence hit
310 my $gene = Bio
::SeqFeature
::Generic
->new(-seq_id
=> $self->{_seqid
},
313 -strand
=> $strand eq 'FW' ?
1 : -1,
315 -primary_tag
=> $motiftag,
316 -source_tag
=> $srctag,
317 -display_name
=> $desctag,
319 'tset' => $self->{_tset
},
320 'tsetdesc' => $self->{_tset_descr
},
321 'cutoff' => $self->{_cutoff
},
322 'database' => $self->{_db
},
323 'dbdesc' => $self->{_db_desc
},
324 'dbratios' => $self->{_db_ratios
},
325 'descline' => $self->{_seq_desc
},
326 'accession' => $self->{_seq_acc
},
327 'logodds' => $logodds,
328 'sequence' => $sequence}
333 # $self->debug("unrecognized line: $_");