Bio/Tools/Analysis/Protein/Scansite.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Tools::Analysis::Protein::Scansite
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Richard Adams <richard.adams@ed.ac.uk>
   8 #
   9 # Copyright Richard Adams
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
  18
  19 =head1 SYNOPSIS
  20
  21   use Bio::Tools::Analysis::Protein::Scansite;
  22
  23   my $seq; # a Bio::PrimarySeqI object
  24
  25   my $tool = Bio::Tools::Analysis::Protein::Scansite->new
  26      ( -seq => $seq->primary_seq );
  27
  28   # run Scansite prediction on a sequence
  29   $tool->run();
  30
  31   # alternatively you can say
  32   $tool->seq($seq->primary_seq)->run;
  33
  34   die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
  35
  36   print $tool->result;     # print raw prediction to STDOUT
  37
  38   foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
  39
  40       # do something to SeqFeature
  41       # e.g. print as GFF
  42       print $feat->gff_string, "\n";
  43       # or store within the sequence - if it is a Bio::RichSeqI
  44       $seq->add_SeqFeature($feat);
  45
  46  }
  47
  48 =head1 DESCRIPTION
  49
  50 This class is a wrapper around the Scansite 2.0 server which produces
  51 predictions for serine, threonine and tyrosine phosphorylation sites
  52 in eukaryotic proteins. At present this is a basic wrapper for the
  53 "Scan protein by input sequence" functionality, which takes a sequence
  54 and searches for motifs, with the option to select the search
  55 stringency. At present, searches for specific phosphorylation
  56 sites are not supported; all predicted sites are returned.
  57
  58 =head2 Return formats
  59
  60 The Scansite results can be obtained in several formats:
  61
  62 =over 3
  63
  64 =item 1.
  65
  66 By calling
  67
  68   my $res = $tool->result('');
  69
  70 $res holds a string of the predicted sites in tabular format.
  71
  72 =item 2.
  73
  74 By calling
  75
  76   my $data_ref = $tool->result('value')
  77
  78 $data_ref is a reference to an array of hashes. Each element in the
  79 array represents a predicted phosphorylation site. The hash keys are
  80 the names of the data fields,i.e.,
  81
  82     'motif'      => 'Casn_Kin1'       # name of kinase
  83     'percentile' => 0.155             # see Scansite docs
  84     'position'   => 9                 # position in protein
  85     'protein'    => 'A1'              # protein id
  86     'score'      => 0.3696            # see Scansite docs
  87     'sequence'   => 'ASYFDTASYFSADAT' # sequence surrounding site
  88     'site'       => 'S9'              # phosphorylated residue
  89     'zscore'     => '-3.110'          # see Scansite docs
  90
  91 =item 3.
  92
  93 By calling
  94
  95   my @fts = $tool->Result('Bio::SeqFeatureI');
  96
  97 which returns an array of L<Bio::SeqFeatureI> compliant objects with
  98 primary tag value 'Site' and tag names of 'motif', 'score',
  99 'sequence', 'zscore' as above.
 100
 101 =back
 102
 103 See L<http://scansite.mit.edu/>.
 104
 105 This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
 106 write wrappers on various services. This class uses a web resource and
 107 therefore inherits from L<Bio::WebAgent>.
 108
 109 =head1 SEE ALSO
 110
 111 L<Bio::SimpleAnalysisI>,
 112 L<Bio::WebAgent>
 113
 114 =head1 FEEDBACK
 115
 116 =head2 Mailing Lists
 117
 118 User feedback is an integral part of the evolution of this and other
 119 Bioperl modules. Send your comments and suggestions preferably to one
 120 of the Bioperl mailing lists.  Your participation is much appreciated.
 121
 122   bioperl-l@bioperl.org                  - General discussion
 123   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 124
 125 =head2 Support
 126
 127 Please direct usage questions or support issues to the mailing list:
 128
 129 L<bioperl-l@bioperl.org>
 130
 131 rather than to the module maintainer directly. Many experienced and
 132 reponsive experts will be able look at the problem and quickly
 133 address it. Please include a thorough description of the problem
 134 with code and data examples if at all possible.
 135
 136 =head2 Reporting Bugs
 137
 138 Report bugs to the Bioperl bug tracking system to help us keep track
 139 the bugs and their resolution.  Bug reports can be submitted via the
 140 web:
 141
 142   http://bugzilla.open-bio.org/
 143
 144 =head1 AUTHORS
 145
 146 Richard Adams, Richard.Adams@ed.ac.uk,
 147
 148 =head1 APPENDIX
 149
 150 The rest of the documentation details each of the object
 151 methods. Internal methods are usually preceded with a _
 152
 153 =cut
 154
 155
 156 # Let the code begin...
 157
 158
 159 package Bio::Tools::Analysis::Protein::Scansite;
 160 use vars qw($FLOAT @STRINGENCY);
 161 use strict;
 162 use IO::String;
 163 use Bio::SeqIO;
 164 use HTTP::Request::Common qw(POST);
 165 use Bio::SeqFeature::Generic;
 166
 167 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
 168
 169 $FLOAT = '[+-]?\d*\.\d*';
 170 @STRINGENCY = qw(High Medium Low);
 171 my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
 172
 173
 174     my $ANALYSIS_SPEC =
 175         {
 176          'name'        => 'Scansite',
 177          'type'        => 'Protein',
 178          'version'     => '2.0',
 179          'supplier'    => 'Massachusetts Institute of Technology',
 180          'description' => 'Prediction of serine, threonine and tyrosine
 181                              phosphorylation sites in eukaryotic proteins',
 182         };
 183
 184     my $INPUT_SPEC =
 185         [
 186          {
 187           'mandatory' => 'true',
 188           'type'      => 'Bio::PrimarySeqI',
 189           'name'      => 'seq',
 190          },
 191          {
 192           'mandatory' => 'false',
 193           'type'      => 'text',
 194           'name'      => 'protein_id',
 195           'default'   => 'unnamed',
 196          },
 197          {
 198           'mandatory' => 'false',
 199           'type'      => 'text',
 200           'name'      => 'stringency',
 201           'default'   => 'High',
 202          },
 203         ];
 204
 205     my $RESULT_SPEC =
 206         {
 207          ''                 => 'bulk',  # same as undef
 208          'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
 209          'raw'              => 'Array of {motif=>, percentile=>, position=>,
 210                                           protein=>, score=>, site=>, zscore=>
 211                                           sequence=>
 212                                          }',
 213         };
 214
 215
 216 =head2 result
 217
 218  Name    : result
 219  Usage   : $job->result (...)
 220  Returns : a result created by running an analysis
 221  Args    : none (but an implementation may choose
 222            to add arguments for instructions how to process
 223            the raw result)
 224
 225 The method returns a scalar representing a result of an executed
 226 job. If the job was terminated by an error, the result may contain
 227 an error message instead of the real data.
 228
 229 This implementation returns differently processed data depending on
 230 argument:
 231
 232 =over 3
 233
 234 =item undef
 235
 236 Returns the raw ASCII data stream but without HTML tags
 237
 238 =item 'Bio::SeqFeatureI'
 239
 240 The argument string defined the type of bioperl objects returned in an
 241 array.  The objects are L<Bio::SeqFeature::Generic>.
 242
 243 =item 'parsed'
 244
 245 Returns a reference to an array of hashes containing the data of one
 246 phosphorylation site prediction. Key values are:
 247
 248 motif, percentile, position, protein, score, site, zscore,  sequence.
 249
 250
 251 =back
 252
 253
 254 =cut
 255
 256 sub result {
 257     my ($self,$value) = @_;
 258         if( !exists($self->{'_result'}) || $self->status ne 'COMPLETED'){
 259                 $self->throw("Cannot get results, analysis not run!");
 260                 }
 261     my @fts;
 262
 263     if ($value ) {
 264                 if ($value eq 'Bio::SeqFeatureI') {
 265                         for my $hit (@{$self->{'_parsed'}}) {
 266                                 push @fts, Bio::SeqFeature::Generic->new(
 267                                         -start       => $hit->{'position'},
 268                                         -end         => $hit->{'position'},
 269                                     -primary_tag => 'Site',
 270                                         -source      => 'Scansite',
 271                                         -tag => {
 272                                                 score     => $hit->{'score'},
 273                                                 zscore    => $hit->{'zscore'},
 274                                                 motif     => $hit->{'motif'},
 275                                                 site      => $hit->{'site'},
 276                                                 sequence  => $hit->{'sequence'},
 277                                                         },
 278                                 );
 279                         }
 280                         return @fts;
 281                 }
 282                 elsif ($value eq 'meta') {
 283                         $self->throw("No meta sequences available in this analysis!");
 284                         }
 285                 ## else get here
 286                 return $self->{'_parsed'};
 287     }
 288
 289     return $self->{'_result'};
 290 }
 291
 292 =head2  stringency
 293
 294  Usage    : $job->stringency(...)
 295  Returns  : The significance stringency of a prediction
 296  Args     : None (retrieves value) or 'High', 'Medium' or 'Low'.
 297  Purpose  : Get/setter of the stringency to be sumitted for analysis.
 298
 299 =cut
 300
 301 sub stringency {
 302    my ($self,$value) = @_;
 303    if( $value) {
 304        if (! grep{$_=~ /$value/i}@STRINGENCY ) {
 305            $self->throw("I need a stringency of [".
 306                                                 join " ", @STRINGENCY    .
 307                                                 "], not [$value]");
 308        }
 309        $self->{'_stringency'} = $value;
 310        return $self;
 311    }
 312    return $self->{'_stringency'} || $self->input_spec->[2]{'default'} ;
 313 }
 314
 315 =head2  protein_id
 316
 317  Usage    : $job->protein_id(...)
 318  Returns  : The sequence id of the protein or 'unnamed' if not set.
 319  Args     : None
 320  Purpose  : Getter of the seq_id. Returns the display_id of the sequence
 321             object.
 322
 323 =cut
 324
 325 sub protein_id {
 326         my $self = shift;
 327         return defined ($self->seq())? $self->seq->display_id()
 328                                      : $self->input_spec->[1]{'default'};
 329 }
 330
 331 sub _init
 332                 {
 333         my $self = shift;
 334         $self->url($URL);
 335         $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
 336         $self->{'_INPUT_SPEC'}    = $INPUT_SPEC;
 337         $self->{'_RESULT_SPEC'}   = $RESULT_SPEC;
 338         $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
 339         return $self;
 340 }
 341
 342 sub _run {
 343     my $self = shift;
 344
 345     # format the sequence into fasta
 346         $self->delay(1);
 347     # delay repeated calls by default by 3 sec, set delay() to change
 348     $self->sleep;
 349
 350     $self->status('TERMINATED_BY_ERROR');
 351
 352     my $request = POST $self->url,
 353             Content      => [sequence     => $self->seq->seq(),
 354                                                          protein_id   => $self->protein_id(),
 355                                                          motif_option => 'all',
 356                                                          motifs       => '',
 357                                                          motif_groups => '',
 358                                                          stringency   => $self->stringency(),
 359                                                          domain_flag  => '',
 360                                                          submit       => "Submit Request",
 361                                                         ];
 362         ## raw html report,
 363     my $content = $self->request($request);
 364     my $text    = $content->content;
 365
 366         ##access result data from tag in html
 367         my @parsed_Results = ();
 368         my @unwantedParams = qw(db source class);
 369         my @results        = split /sitestats\.phtml\?/, $text;
 370         shift @results;
 371
 372         ##this module generates 'parsed' output directly from html,
 373         ## avoids having toparse twice.
 374
 375         for my $hit (@results) {
 376                 ## get results string
 377                 my ($res) = $hit =~ /^(.+?)"/;
 378
 379                 #get key value pairs
 380                 my %params = $res =~/(\w+)=([^&]+)/g;
 381
 382                 ##remove unwanted data from hash
 383                 map{delete $params{$_}} @unwantedParams;
 384                 push @parsed_Results, \%params;
 385         }
 386
 387         ## now generate text output in table format
 388         my $out_Str = '';
 389         $out_Str   .=  $self->_make_header(\@parsed_Results);
 390         $out_Str   .=  $self->_add_data(\@parsed_Results);
 391
 392
 393     $self->{'_result'} = $out_Str;
 394         $self->{'_parsed'} = \@parsed_Results;
 395
 396         ## is successsful if there are results or if there are no results and
 397         ## this beacuse there are no matches, not because of parsing errors etc.
 398     $self->status('COMPLETED') if $text ne ''       &&
 399         (scalar @results > 0 ||
 400         (scalar @results == 0 && $text =~/No sites found/));
 401     if ($text =~ /server\s+error/i) {
 402         $self->warn("There was an internal server error !- text below") ;
 403                 $self->warn($text);
 404         return;
 405     }
 406 }
 407
 408 sub _process_arguments {
 409
 410     # extra checking for sequence length
 411     # mitoprot specific argument testing
 412     my ($self, $args) = @_;
 413     #use base checking for existence of mandatory fields
 414     $self->SUPER::_process_arguments($args);
 415
 416    # specific requirements
 417    $self->throw("Sequence must be > 15 amino acids long!")
 418            if $self->seq->length < 15;
 419    $self->throw("Sequence must be protein")
 420           unless $self->seq->alphabet() eq 'protein';
 421 }
 422
 423 sub _make_header {
 424         my ($self, $res) = @_;
 425         my $header = '';
 426         for my $k (sort keys %{$res->[0]} ){
 427                 next if $k eq 'sequence';
 428                 $header .= $k;
 429                 $header .= ' 'x(12 -length($k));
 430         }
 431         $header .= "sequence\n\n";
 432         return $header;
 433 }
 434
 435 sub _add_data {
 436         my ($self, $res) = @_;
 437         my $outstr = '';
 438         for my $hit  (@$res) {
 439                 for my $k (sort keys %$hit ){
 440                         next if $k eq 'sequence';
 441                         $outstr .= $hit->{$k};
 442                         $outstr .= ' 'x(12 - length($hit->{$k}));
 443                         }
 444                 $outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};
 445         }
 446         return $outstr;
 447
 448
 449 }
 450
 451
 452 1;