sync w/ main trunk
[bioperl-live.git] / Bio / Tools / Analysis / Protein / Scansite.pm
blob53b24fb2df28686769f339397ae7712261097fe2
1 # $Id$
3 # BioPerl module for Bio::Tools::Analysis::Protein::Scansite
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Richard Adams <richard.adams@ed.ac.uk>
9 # Copyright Richard Adams
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
19 =head1 SYNOPSIS
21 use Bio::Tools::Analysis::Protein::Scansite;
23 my $seq; # a Bio::PrimarySeqI object
25 my $tool = Bio::Tools::Analysis::Protein::Scansite->new
26 ( -seq => $seq->primary_seq );
28 # run Scansite prediction on a sequence
29 $tool->run();
31 # alternatively you can say
32 $tool->seq($seq->primary_seq)->run;
34 die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
36 print $tool->result; # print raw prediction to STDOUT
38 foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
40 # do something to SeqFeature
41 # e.g. print as GFF
42 print $feat->gff_string, "\n";
43 # or store within the sequence - if it is a Bio::RichSeqI
44 $seq->add_SeqFeature($feat);
48 =head1 DESCRIPTION
50 This class is a wrapper around the Scansite 2.0 server which produces
51 predictions for serine, threonine and tyrosine phosphorylation sites
52 in eukaryotic proteins. At present this is a basic wrapper for the
53 "Scan protein by input sequence" functionality, which takes a sequence
54 and searches for motifs, with the option to select the search
55 stringency. At present, searches for specific phosphorylation
56 sites are not supported; all predicted sites are returned.
58 =head2 Return formats
60 The Scansite results can be obtained in several formats:
62 =over 3
64 =item 1.
66 By calling
68 my $res = $tool->result('');
70 $res holds a string of the predicted sites in tabular format.
72 =item 2.
74 By calling
76 my $data_ref = $tool->result('value')
78 $data_ref is a reference to an array of hashes. Each element in the
79 array represents a predicted phosphorylation site. The hash keys are
80 the names of the data fields,i.e.,
82 'motif' => 'Casn_Kin1' # name of kinase
83 'percentile' => 0.155 # see Scansite docs
84 'position' => 9 # position in protein
85 'protein' => 'A1' # protein id
86 'score' => 0.3696 # see Scansite docs
87 'sequence' => 'ASYFDTASYFSADAT' # sequence surrounding site
88 'site' => 'S9' # phosphorylated residue
89 'zscore' => '-3.110' # see Scansite docs
91 =item 3.
93 By calling
95 my @fts = $tool->Result('Bio::SeqFeatureI');
97 which returns an array of L<Bio::SeqFeatureI> compliant objects with
98 primary tag value 'Site' and tag names of 'motif', 'score',
99 'sequence', 'zscore' as above.
101 =back
103 See L<http://scansite.mit.edu/>.
105 This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
106 write wrappers on various services. This class uses a web resource and
107 therefore inherits from L<Bio::WebAgent>.
109 =head1 SEE ALSO
111 L<Bio::SimpleAnalysisI>,
112 L<Bio::WebAgent>
114 =head1 FEEDBACK
116 =head2 Mailing Lists
118 User feedback is an integral part of the evolution of this and other
119 Bioperl modules. Send your comments and suggestions preferably to one
120 of the Bioperl mailing lists. Your participation is much appreciated.
122 bioperl-l@bioperl.org - General discussion
123 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
125 =head2 Support
127 Please direct usage questions or support issues to the mailing list:
129 L<bioperl-l@bioperl.org>
131 rather than to the module maintainer directly. Many experienced and
132 reponsive experts will be able look at the problem and quickly
133 address it. Please include a thorough description of the problem
134 with code and data examples if at all possible.
136 =head2 Reporting Bugs
138 Report bugs to the Bioperl bug tracking system to help us keep track
139 the bugs and their resolution. Bug reports can be submitted via the
140 web:
142 http://bugzilla.open-bio.org/
144 =head1 AUTHORS
146 Richard Adams, Richard.Adams@ed.ac.uk,
148 =head1 APPENDIX
150 The rest of the documentation details each of the object
151 methods. Internal methods are usually preceded with a _
153 =cut
156 # Let the code begin...
159 package Bio::Tools::Analysis::Protein::Scansite;
160 use vars qw($FLOAT @STRINGENCY);
161 use strict;
162 use IO::String;
163 use Bio::SeqIO;
164 use HTTP::Request::Common qw(POST);
165 use Bio::SeqFeature::Generic;
167 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
169 $FLOAT = '[+-]?\d*\.\d*';
170 @STRINGENCY = qw(High Medium Low);
171 my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
174 my $ANALYSIS_SPEC =
176 'name' => 'Scansite',
177 'type' => 'Protein',
178 'version' => '2.0',
179 'supplier' => 'Massachusetts Institute of Technology',
180 'description' => 'Prediction of serine, threonine and tyrosine
181 phosphorylation sites in eukaryotic proteins',
184 my $INPUT_SPEC =
187 'mandatory' => 'true',
188 'type' => 'Bio::PrimarySeqI',
189 'name' => 'seq',
192 'mandatory' => 'false',
193 'type' => 'text',
194 'name' => 'protein_id',
195 'default' => 'unnamed',
198 'mandatory' => 'false',
199 'type' => 'text',
200 'name' => 'stringency',
201 'default' => 'High',
205 my $RESULT_SPEC =
207 '' => 'bulk', # same as undef
208 'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
209 'raw' => 'Array of {motif=>, percentile=>, position=>,
210 protein=>, score=>, site=>, zscore=>
211 sequence=>
216 =head2 result
218 Name : result
219 Usage : $job->result (...)
220 Returns : a result created by running an analysis
221 Args : none (but an implementation may choose
222 to add arguments for instructions how to process
223 the raw result)
225 The method returns a scalar representing a result of an executed
226 job. If the job was terminated by an error, the result may contain
227 an error message instead of the real data.
229 This implementation returns differently processed data depending on
230 argument:
232 =over 3
234 =item undef
236 Returns the raw ASCII data stream but without HTML tags
238 =item 'Bio::SeqFeatureI'
240 The argument string defined the type of bioperl objects returned in an
241 array. The objects are L<Bio::SeqFeature::Generic>.
243 =item 'parsed'
245 Returns a reference to an array of hashes containing the data of one
246 phosphorylation site prediction. Key values are:
248 motif, percentile, position, protein, score, site, zscore, sequence.
251 =back
254 =cut
256 sub result {
257 my ($self,$value) = @_;
258 if( !exists($self->{'_result'}) || $self->status ne 'COMPLETED'){
259 $self->throw("Cannot get results, analysis not run!");
261 my @fts;
263 if ($value ) {
264 if ($value eq 'Bio::SeqFeatureI') {
265 for my $hit (@{$self->{'_parsed'}}) {
266 push @fts, Bio::SeqFeature::Generic->new(
267 -start => $hit->{'position'},
268 -end => $hit->{'position'},
269 -primary_tag => 'Site',
270 -source => 'Scansite',
271 -tag => {
272 score => $hit->{'score'},
273 zscore => $hit->{'zscore'},
274 motif => $hit->{'motif'},
275 site => $hit->{'site'},
276 sequence => $hit->{'sequence'},
280 return @fts;
282 elsif ($value eq 'meta') {
283 $self->throw("No meta sequences available in this analysis!");
285 ## else get here
286 return $self->{'_parsed'};
289 return $self->{'_result'};
292 =head2 stringency
294 Usage : $job->stringency(...)
295 Returns : The significance stringency of a prediction
296 Args : None (retrieves value) or 'High', 'Medium' or 'Low'.
297 Purpose : Get/setter of the stringency to be sumitted for analysis.
299 =cut
301 sub stringency {
302 my ($self,$value) = @_;
303 if( $value) {
304 if (! grep{$_=~ /$value/i}@STRINGENCY ) {
305 $self->throw("I need a stringency of [".
306 join " ", @STRINGENCY .
307 "], not [$value]");
309 $self->{'_stringency'} = $value;
310 return $self;
312 return $self->{'_stringency'} || $self->input_spec->[2]{'default'} ;
315 =head2 protein_id
317 Usage : $job->protein_id(...)
318 Returns : The sequence id of the protein or 'unnamed' if not set.
319 Args : None
320 Purpose : Getter of the seq_id. Returns the display_id of the sequence
321 object.
323 =cut
325 sub protein_id {
326 my $self = shift;
327 return defined ($self->seq())? $self->seq->display_id()
328 : $self->input_spec->[1]{'default'};
331 sub _init
333 my $self = shift;
334 $self->url($URL);
335 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
336 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
337 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
338 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
339 return $self;
342 sub _run {
343 my $self = shift;
345 # format the sequence into fasta
346 $self->delay(1);
347 # delay repeated calls by default by 3 sec, set delay() to change
348 $self->sleep;
350 $self->status('TERMINATED_BY_ERROR');
352 my $request = POST $self->url,
353 Content => [sequence => $self->seq->seq(),
354 protein_id => $self->protein_id(),
355 motif_option => 'all',
356 motifs => '',
357 motif_groups => '',
358 stringency => $self->stringency(),
359 domain_flag => '',
360 submit => "Submit Request",
362 ## raw html report,
363 my $content = $self->request($request);
364 my $text = $content->content;
366 ##access result data from tag in html
367 my @parsed_Results = ();
368 my @unwantedParams = qw(db source class);
369 my @results = split /sitestats\.phtml\?/, $text;
370 shift @results;
372 ##this module generates 'parsed' output directly from html,
373 ## avoids having toparse twice.
375 for my $hit (@results) {
376 ## get results string
377 my ($res) = $hit =~ /^(.+?)"/;
379 #get key value pairs
380 my %params = $res =~/(\w+)=([^&]+)/g;
382 ##remove unwanted data from hash
383 map{delete $params{$_}} @unwantedParams;
384 push @parsed_Results, \%params;
387 ## now generate text output in table format
388 my $out_Str = '';
389 $out_Str .= $self->_make_header(\@parsed_Results);
390 $out_Str .= $self->_add_data(\@parsed_Results);
393 $self->{'_result'} = $out_Str;
394 $self->{'_parsed'} = \@parsed_Results;
396 ## is successsful if there are results or if there are no results and
397 ## this beacuse there are no matches, not because of parsing errors etc.
398 $self->status('COMPLETED') if $text ne '' &&
399 (scalar @results > 0 ||
400 (scalar @results == 0 && $text =~/No sites found/));
401 if ($text =~ /server\s+error/i) {
402 $self->warn("There was an internal server error !- text below") ;
403 $self->warn($text);
404 return;
408 sub _process_arguments {
410 # extra checking for sequence length
411 # mitoprot specific argument testing
412 my ($self, $args) = @_;
413 #use base checking for existence of mandatory fields
414 $self->SUPER::_process_arguments($args);
416 # specific requirements
417 $self->throw("Sequence must be > 15 amino acids long!")
418 if $self->seq->length < 15;
419 $self->throw("Sequence must be protein")
420 unless $self->seq->alphabet() eq 'protein';
423 sub _make_header {
424 my ($self, $res) = @_;
425 my $header = '';
426 for my $k (sort keys %{$res->[0]} ){
427 next if $k eq 'sequence';
428 $header .= $k;
429 $header .= ' 'x(12 -length($k));
431 $header .= "sequence\n\n";
432 return $header;
435 sub _add_data {
436 my ($self, $res) = @_;
437 my $outstr = '';
438 for my $hit (@$res) {
439 for my $k (sort keys %$hit ){
440 next if $k eq 'sequence';
441 $outstr .= $hit->{$k};
442 $outstr .= ' 'x(12 - length($hit->{$k}));
444 $outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};
446 return $outstr;