maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Tools / Analysis / Protein / ELM.pm
blobef71958eeb2e3cfd8c284ae1cdf6dd992687595b
2 # BioPerl module for Bio::Tools::Analysis::Protein::ELM
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Richard Adams <richard.adams@ed.ac.uk>
8 # Copyright Richard Adams
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Analysis::Protein::ELM - a wrapper around the ELM server which predicts short functional motifs on amino acid sequences
18 =head1 SYNOPSIS
20 # get a Bio::Seq object to start with, or a Bio::PrimaryI object.
22 my $tool = Bio::Tools::Analysis::Protein::ELM->
23 new(seq => $seqobj->primary_seq() );
24 $tool->compartment(['ER', 'Golgi']);
25 $tool->species(9606);
26 $tool->run;
27 my @fts = $tool->Result('Bio::SeqFeatureI');
28 $seqobj->addSeqFeature(@fts);
30 =head1 DESCRIPTION
32 This module is a wrapper around the ELM server L<http://elm.eu.org/>
33 which predicts short functional motifs on amino acid sequences.
35 False positives can be limited by providing values for the species
36 and cellular compartment of the protein. To set the species attribute,
37 use either a L<Bio::Species> object or an NCBI taxon ID number. To set
38 the cell compartment attribute (any number of compartments can be
39 chosen) use an array reference to a list of compartment names.
41 Results can be obtained either as raw text output, parsed into a
42 data structure, or as Bio::SeqFeature::Generic objects.
44 =head1 SEE ALSO
46 L<Bio::SimpleAnalysisI>,
47 L<Bio::WebAgent>
49 =head1 FEEDBACK
51 =head2 Mailing Lists
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to one
55 of the Bioperl mailing lists. Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 =head2 Support
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
71 =head2 Reporting Bugs
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via the
75 web:
77 https://github.com/bioperl/bioperl-live/issues
79 =head1 AUTHORS
81 Richard Adams, Richard.Adams@ed.ac.uk,
83 =head1 APPENDIX
85 The rest of the documentation details each of the object
86 methods. Internal methods are usually preceded with a _
88 =cut
90 use strict;
91 package Bio::Tools::Analysis::Protein::ELM;
92 use vars qw(%cc);
93 use HTML::HeadParser;
94 use Bio::SeqFeature::Generic;
95 use HTTP::Request::Common qw(POST);
96 use IO::String;
97 use base qw(Bio::Tools::Analysis::SimpleAnalysisBase);
99 ## valid cell compartments ##
100 %cc = (
101 all => 1,
102 nucleus => 'GO:0005634',
103 extracellular => 'GO:0005576',
104 cytoplasm => 'GO:0005737',
105 peroxisome => 'GO:0005777',
106 glycosome => 'GO:0020015',
107 glyoxisome => 'GO:0009514',
108 golgi => 'GO:0005794',
109 er => 'GO:0005783',
110 lysosome => 'GO:0005764',
111 endosome => 'GO:0005768',
112 plasma_membrane=> 'GO:0005886',
115 my $URL = 'http://elm.eu.org/cgimodel.py';
116 my $ANALYSIS_NAME = 'ELM';
117 my $INPUT_SPEC =
120 'mandatory' => 'true',
121 'type' => 'Bio::PrimarySeqI',
122 'name' => 'seq',
125 'mandatory' => 'false',
126 'type' => 'taxon_id or Bio::Species object',
127 'name' => 'species',
128 'default' => '9606',
131 'mandatory' => 'false',
132 'type' => 'string',
133 'name' => 'compartment',
134 'default' => [1],
138 my $RESULT_SPEC =
140 '' => 'bulk', # same as undef
141 'Bio::SeqFeatureI' => 'ARRAY of Bio::SeqFeature::Generic',
142 'parsed' => '{motif1_name=>{locus=>[],
143 peptide=>[],
144 regexp=>[]
148 my $ANALYSIS_SPEC= {name => 'ELM',
149 type => 'Protein',
150 version => 'n/a',
151 supplier =>'BioComputing Unit, EMBL',
152 description =>'Prediction of linear functional motifs
153 in proteins',
154 reference => 'NAR, 31:3625-3630'};
157 sub _init {
158 my $self = shift;
159 $self->url($URL);
160 $self->{'_ANALYSIS_SPEC'} = $ANALYSIS_SPEC;
161 $self->{'_INPUT_SPEC'} = $INPUT_SPEC;
162 $self->{'_RESULT_SPEC'} = $RESULT_SPEC;
163 $self->{'_ANALYSIS_NAME'} = $ANALYSIS_NAME;
164 return $self;
167 =head2 compartment
169 name : compartment
170 usage : $elm->compartment(['golgi', 'er']);
171 purpose : get/setter for cell compartment specifications
172 arguments : None, single compartment string or ref to array of
173 compartment names.
174 returns : Array of compartment names (default if not previously set).
176 =cut
178 sub compartment {
180 my ($self, $arg) = @_;
181 if ($arg) {
183 # convert to array ref if not one already
184 if (ref ($arg) ne 'ARRAY') {
185 $arg = [$arg];
188 ## now add params if valid
189 for my $param (@$arg) {
190 if (exists($cc{lc($param)})) {
191 push @{$self->{'_compartment'}} , $cc{$param};
192 } else {
193 $self->warn("invalid argument ! Must be one of " .
194 join "\n", keys %cc );
196 } #end of for loop
198 } #endif $arg
199 return defined($self->{'_compartment'})? $self->{'_compartment'}
200 : $self->input_spec()->[2]{'default'};
204 =head1 species
206 name : species
207 usage : $tool->species('9606');
208 purpose : get/setter for species selection for ELM server
209 arguments : none, taxon_id or Bio::Species object
210 returns : a string of the ncbi taxon_id
212 =cut
214 sub species {
215 my ($self, $arg) = @_;
217 if ($arg) {
218 if (ref($arg) && $arg->isa('Bio::Species')) {
219 $self->{'_species'} = $arg->ncbi_taxid();
220 } elsif ($arg =~ /^\d+$/) {
221 $self->{'_species'} = $arg;
222 } else {
223 $self->warn("Argument must be a Bio::Species object or ".
224 " an integer NCBI taxon id. ");
226 } #end if $arg
227 return defined($self->{'_species'})?$self->{'_species'}
228 :$self->input_spec()->[1]{'default'};
232 sub _run {
233 my $self = shift;
234 $self->delay(1);
235 # delay repeated calls by default by 3 sec, set delay() to change
236 #$self->sleep;
237 $self->status('TERMINATED_BY_ERROR');
239 #### this deals with being able to submit multiple checkboxed
240 #### slections
242 #1st of all make param array
243 my @cc_str;
244 my @cmpts = @{$self->compartment()};
245 for (my $i = 0; $i <= $#cmpts ; $i++) {
246 splice @cc_str, @cc_str, 0, 'userCC',$cmpts[$i];
248 my %h = (swissprotId => "",
249 sequence => $self->seq->seq,
250 userSpecies => $self->species,
251 typedUserSpecies => '',
252 fun => "Submit");
253 splice (@cc_str, @cc_str,0, ( map{$_, $h{$_}} keys %h));
256 my $request = POST $self->url(),
257 Content_Type => 'form-data',
258 Content => \@cc_str;
259 $self->debug( $request->as_string);
260 my $r1 = $self->request($request);
261 if ( $r1->is_error ) {
262 $self->warn(ref($self)." Request Error:\n".$r1->as_string);
263 return;
266 my $text = $r1->content;
267 my ($url) = $text =~ /URL=\S+(fun=\S+r=\d)/s;
268 #$url =~ s/amp;//g ;
269 my ($resp2);
270 $url = $URL . "?" .$url;
271 while (1) {
272 my $req2 = HTTP::Request->new(GET=>$url);
273 my $r2 = $self->request ($req2);
274 if ( $r2->is_error ) {
275 $self->warn(ref($self)." Request Error:\n".$r2->as_string);
276 return;
278 $resp2 = $r2->content();
280 if ($resp2 !~ /patient/s) {
281 $self->status('COMPLETED');
282 $resp2=~ s/<[^>]+>/ /sg;
283 $self->{'_result'} = $resp2;
284 return;
285 } else {
286 print "." if $self->verbose > 0;
287 $self->sleep(1);
292 =head1 result
294 name : result
295 usage : $tool->result('Bio::SeqFeatureI');
296 purpose : parse results into sequence features or basic data format
297 arguments : 1. none (retrieves raw text without html)
298 2. a value (retrieves data structure)
299 3. 'Bio::SeqFeatureI' (returns array of sequence features)
300 tag names are : {method => 'ELM', motif => motifname,
301 peptide => seqeunce of match,
302 concensus => regexp of match}.
303 returns : see arguments.
305 =cut
307 sub result {
308 my ($self, $val) = @_;
309 if ($val) {
310 if (!exists($self->{'_parsed'}) ) {
311 $self->_parse_raw();
313 if ($val eq 'Bio::SeqFeatureI') {
314 my @fts;
315 for my $motif (keys %{$self->{'_parsed'}}) {
316 for (my $i = 0; $i< scalar @{$self->{'_parsed'}{$motif}{'locus'}};$i++) {
317 my ($st, $end) = split /\-/, $self->{'_parsed'}{$motif}{'locus'}[$i];
318 push @fts, Bio::SeqFeature::Generic->new
320 -start => $st,
321 -end => $end,
322 -primary_tag => 'Domain',
323 -source => 'ELM',
324 -tag => {
325 method => 'ELM',
326 motif => $motif,
327 peptide => $self->{'_parsed'}{$motif}{'peptide'}[$i],
328 concensus => $self->{'_parsed'}{$motif}{'regexp'}[0],
332 return @fts;
333 } #end if BioSeqFeature
334 return $self->{'_parsed'};
335 } #endif ($val)
336 return $self->{'_result'};
339 ## internal sub to parse raw data into internal data structure which is cached.
340 sub _parse_raw {
341 my $self = shift;
342 my $result = IO::String->new($self->{'_result'});
343 my $in_results = 0;
344 my $name;
345 my %results;
346 my $last;
347 while (my $l = <$result>) {
348 next unless $in_results > 0 ||$l =~ /^\s+Elm\s+Name\s+Instances/;
349 $in_results++; #will be set whnstart of results reached.
350 last if $l =~ /List of excluded/;
351 next unless $in_results >1;
353 my @line_parts = split /\s+/, $l;
354 shift @line_parts;
355 ## if result has motif name on 1 line
356 if (scalar @line_parts == 1 && $line_parts[0]=~ /^\s*(\w+_\w+)/) {
357 $name = $1;
358 next;
360 ## else if is line with loci /seq matches
361 elsif (@line_parts > 1) {
362 my $index = 0; ## array index
363 my $read_loci = 0; ## flag to know that loci are being read
364 while ($index <= $#line_parts) {
365 my $word = $line_parts[$index++];
366 if ($read_loci ==0 && $word =~/_/) {
367 $name = $word;
368 } elsif ($read_loci == 0 && $word =~ /^\w+$/ ) {
369 push @{$results{$name}{'peptide'}}, $word;
370 } elsif ($word =~ /\d+\-\d+/) {
371 $read_loci = 1;
372 push @{$results{$name}{'locus'}}, $word;
373 } else { ## only get here if there are elements
374 last;
376 } #end of while
377 push @{$results{$name}{'regexp'}}, $line_parts[$#line_parts];
378 } #end of elsif
380 } #end of while
382 $self->{'_parsed'} = \%results;