sync w/ main trunk
[bioperl-live.git] / Bio / Tools / Analysis / Protein / Domcut.pm
blob7091321e99647eb924af8a363274a0fb41da3e83
1 # $Id: Domcut.pm,v 1.0 2003/07/ 11
3 # BioPerl module for Bio::Tools::Analysis::Protein::Domcut
5 # Copyright Richard Adams
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Tools::Analysis::Protein::Domcut - a wrapper around Domcut server
15 =head1 SYNOPSIS
17 use Bio::Tools::Analysis::Protein::Domcut;
18 #get a Bio::PrimarySeq
19 use Bio::PrimarySeq;
20 my $seq = Bio::PrimarySeq->new
21 (-seq=>'IKLCVNLAILAKAHLIELALAL',
22 -primary_id=>'test'); # a Bio::PrimarySeqI object
24 my $domcut = Bio::Tools::Analysis::Protein::Domcut->new (-seq=>$seq);
25 $domcut->run;
26 print $domcut->result;# #raw text to standard out
28 =head1 DESCRIPTION
30 A module to remotely retrieve predictions of protein domain
31 boundaries. Each residue in the protein receives a score, those
32 better than the significance threshold and at a local minimum receive
33 a rank - i.e., the best minimum is rank 1, the second best minimum is
34 rank2 etc. These correspond to domain boundaries. e.g.,
36 my $analysis_object = Bio::Tools::Analysis::Protein::Domcut->new
37 (-seq => $seq);
39 creates a new object. The sequence supplied must be a Bio::PrimarySeq and not
40 a Bio::Seq object.
42 $analysis_object->run;
44 submits the query to the server and obtains raw text output
46 Given an amino acid sequence the results can be obtained in 4 formats,
47 determined by the argument to the result method
49 =over 4
51 =item 1
53 The raw text of the program output
55 my $rawdata = $analysis_object->result;
57 =item 2
59 A reference to an array of hashes of scores for each state and the
60 assigned state. Each element in the array is a residue (indexed from 0).
62 my $data_ref = $analysis_object->result('parsed');
63 print "score for helix at residue 2 is $data_ref->[1]{'helix'}\n";
64 print "predicted struc at residue 2 is $data_ref->[1]{'struc}\n";
66 =item 3
68 An array of Bio::SeqFeature::Generic objects where each feature is a
69 predicted unit of secondary structure. Only stretches of helix/sheet
70 predictions for longer than 4 residues are defined as helices.
71 So, in order to add features to an existing Bio::Seq object;
73 # get a Bio::Seq object
74 my $seqobj;
75 my $tool = Bio::Tools::Analysis::Protein::Domcut->new
76 ( -seq => $seqobj->primary_seq);
77 $tool->run;
79 my @fts = $tool->result(Bio::SeqFeatureI);
81 $seqobj->add_SeqFeature(@fts);
83 # if you want meta sequences as well :
84 my $meta = $tool->result('meta');
85 $seqobj->primary_seq($meta);
87 # can access meta data in a Bio::Seq object via a
88 # call to primary_seq:
90 print $seq4->primary_seq->named_submeta_text('Domcut', 1,2), "\n";
92 =item 4
94 A Bio::Seq::Meta::Array implementing sequence.
96 This is a Bio::Seq object that can also hold data about each residue
97 in the sequence. In this case, the sequence can be associated with a
98 single array of Domcut prediction scores. e.g.,
100 my $meta_sequence = $analysis_object->result('meta');
101 print "scores from residues 10 -20 are ",
102 $meta_sequence->submeta_text(10,20), "\n";
104 Many methods common to all analyses are inherited from
105 Bio::Tools::Analysis::SimpleAnalysisBase.
107 =back
109 =head1 SEE ALSO
111 L<Bio::SimpleAnalysisI>,
112 L<Bio::Tools::Analysis::SimpleAnalysisBase>,
113 L<Bio::Seq::Meta::Array>,
114 L<Bio::WebAgent>
116 =head1 FEEDBACK
118 =head2 Mailing Lists
120 User feedback is an integral part of the evolution of this and other
121 Bioperl modules. Send your comments and suggestions preferably to one
122 of the Bioperl mailing lists. Your participation is much appreciated.
124 bioperl-l@bioperl.org - General discussion
125 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
127 =head2 Support
129 Please direct usage questions or support issues to the mailing list:
131 L<bioperl-l@bioperl.org>
133 rather than to the module maintainer directly. Many experienced and
134 reponsive experts will be able look at the problem and quickly
135 address it. Please include a thorough description of the problem
136 with code and data examples if at all possible.
138 =head2 Reporting Bugs
140 Report bugs to the Bioperl bug tracking system to help us keep track
141 the bugs and their resolution. Bug reports can be submitted via the
142 web:
144 http://bugzilla.open-bio.org/
146 =head1 AUTHORS
148 Richard Adams, Richard.Adams@ed.ac.uk,
150 =head1 APPENDIX
152 The rest of the documentation details each of the object
153 methods. Internal methods are usually preceded with a _
156 =cut
159 use strict;
160 package Bio::Tools::Analysis::Protein::Domcut;
161 use IO::String;
162 use Bio::SeqIO;
163 use HTTP::Request::Common qw(GET);
164 use Bio::SeqFeature::Generic;
165 use Bio::Seq::Meta::Array;
167 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
169 my $URL = 'http://www.Bork.EMBL-Heidelberg.DE/Docu/mikita/domplot.cgi?';
170 my $ANALYSIS_NAME = 'Domcut';
171 my $ANALYSIS_SPEC =
173 'name' => 'Domcut',
174 'type' => 'protein', #compulsory entry as is used for seq checking
175 'version' => 'n/a',
176 'supplier' => 'Ohara lab, Laboratory of DNA technology,
177 Kazusa DNA Research Institute, 1532-3 Yana,
178 Kisarazu, Japan',
179 'description' => 'to predict domain boundaries in proteins',
180 'reference' => 'Bioinformatics 19, 673-674 (2003)',
184 my $INPUT_SPEC =
187 'mandatory' => 'true',
188 'type' => 'Bio::PrimarySeqI',
189 'name' => 'seq',
193 my $RESULT_SPEC =
195 '' => 'bulk', # same as undef
196 'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
197 'parsed' => "Array of {'score' =>, 'rank'=> ]",
198 'meta' => 'Bio::Seq::Meta::Array object'
201 =head2 result
203 Name : result
204 Purpose : To retrieve results of analysis in one of several formats.
205 Usage : $job->result (...)
206 Returns : a result created by running an analysis
207 Args : various - see keysin $RESULT_SPEC.
209 The method returns a result of an executed job. If the job was
210 terminated by an error the result may contain an error message instead
211 of the real data.
213 This implementation returns differently processed data depending on
214 argument:
216 =over 3
218 =item undef
220 Returns the raw ASCII data stream but without HTML tags
222 =item 'Bio::SeqFeatureI'
224 The argument string defines the type of bioperl objects returned in an
225 array. The objects are L<Bio::SeqFeature::Generic>. Tagnames are 'score'
226 and 'rank'.
228 =item 'parsed'
230 Array of array references of [score, rank].
232 =item 'all'
234 A Bio::Seq::Meta::Array object. Scores can be accessed using methods
235 from this class. Meta sequence name is Domcut.
237 =back
239 =cut
242 sub result {
243 my ($self,$value) = @_;
244 my @scores;
245 my @fts;
247 if ($value ) {
248 # parse raw text if not already done so
249 if (!exists($self->{'_parsed'})) {
250 my $result = IO::String->new($self->{'_result'});
251 while (my $line = <$result>) {
252 next if $line =~/#/;
253 $line =~/(\-?\d\.\d+)\s+(\d+)?/;
254 push @scores, {score => $1,
255 rank => ($2)?$2:'' ,
258 #hold parsed results in object, saves having to reparse each time
259 $self->{'_parsed'} = \@scores;
261 #make aarray of Bio::SeqFeature::Generic objects
262 if ($value eq 'Bio::SeqFeatureI') {
263 my $i = 0; #array index (= aa num -1)
264 my $in_trough = 0;
265 my ($st, $end, $rank, $min_score, $min_locus) = (0,0,0,0,0);
266 my $seqlen = $self->seq->length();
267 for my $score (@{$self->{'_parsed'}}) {
269 ##start a potential trough
270 if ($in_trough == 0 && $score->{'score'} < -0.09) {
271 $in_trough = 1;
272 $st = $i+1;
275 ## in a trough, is it ranked?
276 elsif ( $in_trough == 1 && $score->{'score'} < -0.09 && $i +1 < $seqlen){
277 if ($score->{'rank'} ) {
278 $rank = $score->{'rank'};
279 $min_score = $score->{'score'};
280 $min_locus = $i + 1;
284 ## end of trough or end of sequence, make into feature
285 ## if possible
286 elsif ($in_trough == 1 && ($score->{'score'} > -0.09 ||
287 $i +1 == $seqlen) ){
288 if ($rank != 0) {
289 push @fts, Bio::SeqFeature::Generic->new (
290 -start => $st,
291 -end => $i +1, #current position
292 -primary => 'Linker',
293 -source => 'Domcut',
294 -tag => {
295 score => $min_score,
296 rank => $rank,
297 residue => $min_locus,
301 ##and reset parameters ##
302 ($st, $in_trough, $min_locus, $min_score, $rank) = (0,0,0,0,0);
304 $i++;
306 return @fts;
308 ## convert parsed data into a meta array format
309 elsif ($value eq 'meta') {
311 ## only need to bless once
312 if (! $self->seq->isa("Bio::Seq::MetaI")){
313 bless ($self->seq, "Bio::Seq::Meta::Array");
315 $self->seq->isa("Bio::Seq::MetaI")
316 || $self->throw("$self is not a Bio::Seq::MetaI");
317 my $meta_name = "Domcut";
319 #test that sequence does not have already a meta seq with same name
320 if (grep{$_ eq $meta_name}$self->seq->meta_names ) {
321 $self->warn ("$meta_name already exists , not overwriting!");
322 next;
325 ### or should be an instance variable?? ##
326 $Bio::Seq::Meta::Array::DEFAULT_NAME = 'Domcut';
327 my @meta = map{$_->{'score'}} @{$self->{'_parsed'}};
328 $self->seq->named_meta($meta_name,\@meta );
330 # return seq array object implementing meta sequence #
331 return $self->seq;
334 # return ref to array of predictions;
335 elsif ($value eq 'parsed') {
336 return $self->{'_parsed'};
339 #else if no arguments return raw text
340 return $self->{'_result'};
343 sub _init {
344 my $self = shift;
345 $self->url($URL);
346 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
347 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
348 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
349 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_NAME;
350 return $self;
353 sub _run {
354 my $self = shift;
355 my $seq_fasta = $self->seq->seq;
356 $self->delay(1);
357 # delay repeated calls by default by 3 sec, set delay() to change
358 $self->sleep;
359 $self->status('TERMINATED_BY_ERROR');
360 my $rqst = GET $self->url . "&seqnam=". "&sequence=".
361 $seq_fasta. "&outform=dat";
363 my $content = $self->request($rqst);
364 my $text = $content->content; #1st reponse
365 $self->{'_result'} = $text;
366 $self->status('COMPLETED') if $text ne '';