Bio/Tools/Analysis/Protein/Scansite.pm

   1 #
   2 # BioPerl module for Bio::Tools::Analysis::Protein::Scansite
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Richard Adams <richard.adams@ed.ac.uk>
   7 #
   8 # Copyright Richard Adams
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::Analysis::Protein::Scansite - a wrapper around the Scansite server
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::Tools::Analysis::Protein::Scansite;
  21
  22   my $seq; # a Bio::PrimarySeqI object
  23
  24   my $tool = Bio::Tools::Analysis::Protein::Scansite->new
  25      ( -seq => $seq->primary_seq );
  26
  27   # run Scansite prediction on a sequence
  28   $tool->run();
  29
  30   # alternatively you can say
  31   $tool->seq($seq->primary_seq)->run;
  32
  33   die "Could not get a result" unless $tool->status =~ /^COMPLETED/;
  34
  35   print $tool->result;     # print raw prediction to STDOUT
  36
  37   foreach my $feat ( $tool->result('Bio::SeqFeatureI') ) {
  38
  39       # do something to SeqFeature
  40       # e.g. print as GFF
  41       print $feat->gff_string, "\n";
  42       # or store within the sequence - if it is a Bio::RichSeqI
  43       $seq->add_SeqFeature($feat);
  44
  45  }
  46
  47 =head1 DESCRIPTION
  48
  49 This class is a wrapper around the Scansite 2.0 server which produces
  50 predictions for serine, threonine and tyrosine phosphorylation sites
  51 in eukaryotic proteins. At present this is a basic wrapper for the
  52 "Scan protein by input sequence" functionality, which takes a sequence
  53 and searches for motifs, with the option to select the search
  54 stringency. At present, searches for specific phosphorylation
  55 sites are not supported; all predicted sites are returned.
  56
  57 =head2 Return formats
  58
  59 The Scansite results can be obtained in several formats:
  60
  61 =over 3
  62
  63 =item 1.
  64
  65 By calling
  66
  67   my $res = $tool->result('');
  68
  69 $res holds a string of the predicted sites in tabular format.
  70
  71 =item 2.
  72
  73 By calling
  74
  75   my $data_ref = $tool->result('value')
  76
  77 $data_ref is a reference to an array of hashes. Each element in the
  78 array represents a predicted phosphorylation site. The hash keys are
  79 the names of the data fields,i.e.,
  80
  81     'motif'      => 'Casn_Kin1'       # name of kinase
  82     'percentile' => 0.155             # see Scansite docs
  83     'position'   => 9                 # position in protein
  84     'protein'    => 'A1'              # protein id
  85     'score'      => 0.3696            # see Scansite docs
  86     'sequence'   => 'ASYFDTASYFSADAT' # sequence surrounding site
  87     'site'       => 'S9'              # phosphorylated residue
  88     'zscore'     => '-3.110'          # see Scansite docs
  89
  90 =item 3.
  91
  92 By calling
  93
  94   my @fts = $tool->Result('Bio::SeqFeatureI');
  95
  96 which returns an array of L<Bio::SeqFeatureI> compliant objects with
  97 primary tag value 'Site' and tag names of 'motif', 'score',
  98 'sequence', 'zscore' as above.
  99
 100 =back
 101
 102 See L<http://scansite.mit.edu/>.
 103
 104 This inherits Bio::SimpleAnalysisI which hopefully makes it easier to
 105 write wrappers on various services. This class uses a web resource and
 106 therefore inherits from L<Bio::WebAgent>.
 107
 108 =head1 SEE ALSO
 109
 110 L<Bio::SimpleAnalysisI>,
 111 L<Bio::WebAgent>
 112
 113 =head1 FEEDBACK
 114
 115 =head2 Mailing Lists
 116
 117 User feedback is an integral part of the evolution of this and other
 118 Bioperl modules. Send your comments and suggestions preferably to one
 119 of the Bioperl mailing lists.  Your participation is much appreciated.
 120
 121   bioperl-l@bioperl.org                  - General discussion
 122   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 123
 124 =head2 Support
 125
 126 Please direct usage questions or support issues to the mailing list:
 127
 128 I<bioperl-l@bioperl.org>
 129
 130 rather than to the module maintainer directly. Many experienced and
 131 reponsive experts will be able look at the problem and quickly
 132 address it. Please include a thorough description of the problem
 133 with code and data examples if at all possible.
 134
 135 =head2 Reporting Bugs
 136
 137 Report bugs to the Bioperl bug tracking system to help us keep track
 138 the bugs and their resolution.  Bug reports can be submitted via the
 139 web:
 140
 141   https://github.com/bioperl/bioperl-live/issues
 142
 143 =head1 AUTHORS
 144
 145 Richard Adams, Richard.Adams@ed.ac.uk,
 146
 147 =head1 APPENDIX
 148
 149 The rest of the documentation details each of the object
 150 methods. Internal methods are usually preceded with a _
 151
 152 =cut
 153
 154
 155 # Let the code begin...
 156
 157
 158 package Bio::Tools::Analysis::Protein::Scansite;
 159 use vars qw($FLOAT @STRINGENCY);
 160 use strict;
 161 use IO::String;
 162 use Bio::SeqIO;
 163 use HTTP::Request::Common qw(POST);
 164 use Bio::SeqFeature::Generic;
 165
 166 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
 167
 168 $FLOAT = '[+-]?\d*\.\d*';
 169 @STRINGENCY = qw(High Medium Low);
 170 my $URL = 'http://scansite.mit.edu/cgi-bin/motifscan_seq';
 171
 172
 173     my $ANALYSIS_SPEC =
 174         {
 175          'name'        => 'Scansite',
 176          'type'        => 'Protein',
 177          'version'     => '2.0',
 178          'supplier'    => 'Massachusetts Institute of Technology',
 179          'description' => 'Prediction of serine, threonine and tyrosine
 180                              phosphorylation sites in eukaryotic proteins',
 181         };
 182
 183     my $INPUT_SPEC =
 184         [
 185          {
 186           'mandatory' => 'true',
 187           'type'      => 'Bio::PrimarySeqI',
 188           'name'      => 'seq',
 189          },
 190          {
 191           'mandatory' => 'false',
 192           'type'      => 'text',
 193           'name'      => 'protein_id',
 194           'default'   => 'unnamed',
 195          },
 196          {
 197           'mandatory' => 'false',
 198           'type'      => 'text',
 199           'name'      => 'stringency',
 200           'default'   => 'High',
 201          },
 202         ];
 203
 204     my $RESULT_SPEC =
 205         {
 206          ''                 => 'bulk',  # same as undef
 207          'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
 208          'raw'              => 'Array of {motif=>, percentile=>, position=>,
 209                                           protein=>, score=>, site=>, zscore=>
 210                                           sequence=>
 211                                          }',
 212         };
 213
 214
 215 =head2 result
 216
 217  Name    : result
 218  Usage   : $job->result (...)
 219  Returns : a result created by running an analysis
 220  Args    : none (but an implementation may choose
 221            to add arguments for instructions how to process
 222            the raw result)
 223
 224 The method returns a scalar representing a result of an executed
 225 job. If the job was terminated by an error, the result may contain
 226 an error message instead of the real data.
 227
 228 This implementation returns differently processed data depending on
 229 argument:
 230
 231 =over 3
 232
 233 =item undef
 234
 235 Returns the raw ASCII data stream but without HTML tags
 236
 237 =item 'Bio::SeqFeatureI'
 238
 239 The argument string defined the type of bioperl objects returned in an
 240 array.  The objects are L<Bio::SeqFeature::Generic>.
 241
 242 =item 'parsed'
 243
 244 Returns a reference to an array of hashes containing the data of one
 245 phosphorylation site prediction. Key values are:
 246
 247 motif, percentile, position, protein, score, site, zscore,  sequence.
 248
 249
 250 =back
 251
 252
 253 =cut
 254
 255 sub result {
 256     my ($self,$value) = @_;
 257         if( !exists($self->{'_result'}) || $self->status ne 'COMPLETED'){
 258                 $self->throw("Cannot get results, analysis not run!");
 259                 }
 260     my @fts;
 261
 262     if ($value ) {
 263                 if ($value eq 'Bio::SeqFeatureI') {
 264                         for my $hit (@{$self->{'_parsed'}}) {
 265                                 push @fts, Bio::SeqFeature::Generic->new(
 266                                         -start       => $hit->{'position'},
 267                                         -end         => $hit->{'position'},
 268                                     -primary_tag => 'Site',
 269                                         -source      => 'Scansite',
 270                                         -tag => {
 271                                                 score     => $hit->{'score'},
 272                                                 zscore    => $hit->{'zscore'},
 273                                                 motif     => $hit->{'motif'},
 274                                                 site      => $hit->{'site'},
 275                                                 sequence  => $hit->{'sequence'},
 276                                                         },
 277                                 );
 278                         }
 279                         return @fts;
 280                 }
 281                 elsif ($value eq 'meta') {
 282                         $self->throw("No meta sequences available in this analysis!");
 283                         }
 284                 ## else get here
 285                 return $self->{'_parsed'};
 286     }
 287
 288     return $self->{'_result'};
 289 }
 290
 291 =head2  stringency
 292
 293  Usage    : $job->stringency(...)
 294  Returns  : The significance stringency of a prediction
 295  Args     : None (retrieves value) or 'High', 'Medium' or 'Low'.
 296  Purpose  : Get/setter of the stringency to be sumitted for analysis.
 297
 298 =cut
 299
 300 sub stringency {
 301    my ($self,$value) = @_;
 302    if( $value) {
 303        if (! grep{$_=~ /$value/i}@STRINGENCY ) {
 304            $self->throw("I need a stringency of [".
 305                                                 join " ", @STRINGENCY    .
 306                                                 "], not [$value]");
 307        }
 308        $self->{'_stringency'} = $value;
 309        return $self;
 310    }
 311    return $self->{'_stringency'} || $self->input_spec->[2]{'default'} ;
 312 }
 313
 314 =head2  protein_id
 315
 316  Usage    : $job->protein_id(...)
 317  Returns  : The sequence id of the protein or 'unnamed' if not set.
 318  Args     : None
 319  Purpose  : Getter of the seq_id. Returns the display_id of the sequence
 320             object.
 321
 322 =cut
 323
 324 sub protein_id {
 325         my $self = shift;
 326         return defined ($self->seq())? $self->seq->display_id()
 327                                      : $self->input_spec->[1]{'default'};
 328 }
 329
 330 sub _init
 331                 {
 332         my $self = shift;
 333         $self->url($URL);
 334         $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
 335         $self->{'_INPUT_SPEC'}    = $INPUT_SPEC;
 336         $self->{'_RESULT_SPEC'}   = $RESULT_SPEC;
 337         $self->{'_ANALYSIS_NAME'} = $ANALYSIS_SPEC->{'name'};
 338         return $self;
 339 }
 340
 341 sub _run {
 342     my $self = shift;
 343
 344     # format the sequence into fasta
 345         $self->delay(1);
 346     # delay repeated calls by default by 3 sec, set delay() to change
 347     $self->sleep;
 348
 349     $self->status('TERMINATED_BY_ERROR');
 350
 351     my $request = POST $self->url,
 352             Content      => [sequence     => $self->seq->seq(),
 353                                                          protein_id   => $self->protein_id(),
 354                                                          motif_option => 'all',
 355                                                          motifs       => '',
 356                                                          motif_groups => '',
 357                                                          stringency   => $self->stringency(),
 358                                                          #domain_flag  => '',
 359                                                          submit       => "Submit Request",
 360                                                         ];
 361         ## raw html report,
 362     my $content = $self->request($request);
 363     my $text    = $content->content;
 364
 365         ##access result data from tag in html
 366         my @parsed_Results = ();
 367         my @unwantedParams = qw(db source class);
 368         my @results        = split /sitestats\.phtml\?/, $text;
 369         shift @results;
 370
 371         ##this module generates 'parsed' output directly from html,
 372         ## avoids having toparse twice.
 373
 374         for my $hit (@results) {
 375                 ## get results string
 376                 my ($res) = $hit =~ /^(.+?)"/;
 377
 378                 #get key value pairs
 379                 my %params = $res =~/(\w+)=([^&]+)/g;
 380
 381                 ##remove unwanted data from hash
 382                 map{delete $params{$_}} @unwantedParams;
 383                 push @parsed_Results, \%params;
 384         }
 385
 386         ## now generate text output in table format
 387         my $out_Str = '';
 388         $out_Str   .=  $self->_make_header(\@parsed_Results);
 389         $out_Str   .=  $self->_add_data(\@parsed_Results);
 390
 391
 392     $self->{'_result'} = $out_Str;
 393         $self->{'_parsed'} = \@parsed_Results;
 394
 395         ## is successsful if there are results or if there are no results and
 396         ## this beacuse there are no matches, not because of parsing errors etc.
 397     $self->status('COMPLETED') if $text ne ''       &&
 398         (scalar @results > 0 ||
 399         (scalar @results == 0 && $text =~/No sites found/));
 400     if ($text =~ /server\s+error/i) {
 401         $self->throw("Internal server error:\n\n $text");
 402         return;
 403     }
 404 }
 405
 406 sub _process_arguments {
 407
 408     # extra checking for sequence length
 409     # mitoprot specific argument testing
 410     my ($self, $args) = @_;
 411     #use base checking for existence of mandatory fields
 412     $self->SUPER::_process_arguments($args);
 413
 414    # specific requirements
 415    $self->throw("Sequence must be > 15 amino acids long!")
 416            if $self->seq->length < 15;
 417    $self->throw("Sequence must be protein")
 418           unless $self->seq->alphabet() eq 'protein';
 419 }
 420
 421 sub _make_header {
 422         my ($self, $res) = @_;
 423         my $header = '';
 424         for my $k (sort keys %{$res->[0]} ){
 425                 next if $k eq 'sequence';
 426                 $header .= $k;
 427                 $header .= ' 'x(12 -length($k));
 428         }
 429         $header .= "sequence\n\n";
 430         return $header;
 431 }
 432
 433 sub _add_data {
 434         my ($self, $res) = @_;
 435         my $outstr = '';
 436         for my $hit  (@$res) {
 437                 for my $k (sort keys %$hit ){
 438                         next if $k eq 'sequence';
 439                         $outstr .= $hit->{$k};
 440                         $outstr .= ' 'x(12 - length($hit->{$k}));
 441                         }
 442                 $outstr .= $hit->{'sequence'}. "\n" if $hit->{'sequence'};
 443         }
 444         return $outstr;
 445
 446
 447 }
 448
 449
 450 1;