Supply TEMPLATE and SUFFIX for temporary query sequence files.
[bioperl-run.git] / scripts / panalysis.PLS
blob7680bd925bf70235bd809bbb5bd4a1175892bc3c
1 #!/usr/bin/perl -w
3 # A client showing how to use Bio::Tools::Run::Analysis module,
4 # a module for executing and controlling local or remote analysis tools.
5 # It also calls methods from Bio::Tools::Run::AnalysisFactory module.
7 # It has many options in order to cover as many methods as
8 # possible. Because of that, it can be also used as a fully
9 # functional command-line client for accessing various analysis
10 # tools.
12 # Usage: ./panalysis.PLS -h
13 # or: perldoc panalysis.PLS
15 # martin.senger@gmail.com
16 # July 2002
18 # $Id: panalysis.PLS,v 1.10 2006-07-04 22:23:36 mauricio Exp $
19 #-----------------------------------------------------------------------------
21 use strict;
# Build the command-line help text.
#
# Returns the whole usage message as a single multi-line string; the
# caller decides where to print it.
#
# The here-document is single-quoted, so '$' and '@' inside the text are
# literal and need no escaping. Its terminator must stay alone at the
# start of a line, otherwise the here-document does not end there.
# NOTE(review): the column alignment below was re-created by hand; only
# the wording is taken from the original text.
sub get_usage {
    return <<'END_OF_USAGE';
Usage:
   panalysis.PLS [options] [input-data]

where 'options' are:
   -A <access>    access method (default 'soap')
   -l <location>  where are the analyses
   -n <name>      name of an analysis
   -j <job-id>    ID of a previously created job

   -L             list all available analyses
   -c             list all available categories
   -C <category>  show all analyses in given category

   -i, -I         show specification of data inputs
   -o, -O         show specification of results
   -a             show specification of the analysis
   -d             show analysis metadata (XML)

   -b             create job from [input-data]
                  (default: create a job also without -b option
                   if there is no -j option and if there are some
                   'input-data' on the command-line)
   -x             create job from [input-data] and run it
   -w             create job from [input-data], run it and wait for it
   -x -j <job-id> run a previously created job
   -w -j <job-id> run a previously created job and wait for it
   -k -j <job-id> kill a previously created job

   -s             show job status
   -t             show all job times
   -T <acbfe>     show some job times (all, created, begun, finished, elapsed)
   -e             show job last event (XML)

   -r             retrieve all results
   -R <list>      retrieve named results; comma-separated list, each item:
                     <result-name>
                     <result-name>=<filename>
                     <result-name>=@[filename-template]
                     <result-name>=?[filename-template]
                  where 'filename-template' can contain:
                     *          ... will be replaced by a unique number
                     $ANALYSIS  ... will be replaced by an analysis name
                     $RESULT    ... will be replaced by a result name
                     any other characters (suitable for filenames)

   -z             at the end remove job and all its results

   -h             this help
   -v, -V         show version(s)
   -q             be less verbose

where 'input-data' are:
   <input-data-name>=<value>...
   <input-data-name>=@<filename-with-value>...

Environment variables:
   HTTPPROXY                 HTTP proxy server
   HTTPTIMEOUT               HTTP timeout (0 means no timeout at all)
   RESULT_FILENAME_TEMPLATE  template for inventing filenames for results

For more details type: perldoc panalysis.PLS

END_OF_USAGE
}
# Compile-time setup: extend @INC, parse the command line, and serve the
# two options (-h, -v) that need nothing but this script itself.
# Both of those branches exit inside BEGIN, i.e. before any code that
# follows this block is compiled or run.
BEGIN {
    # add path to the directory with this script; when $0 contains no
    # directory part, fall back to '.' rather than pushing the script
    # name itself onto @INC
    my $mylib = $0;
    $mylib = '.' unless $mylib =~ s{/[^/]+$}{};
    unshift @INC, $mylib;

    # be prepared for command-line options/arguments
    use Getopt::Std;

    # general options
    use vars qw/ $opt_h $opt_v $opt_V $opt_q /;
    # specialized options
    use vars qw/ $opt_A $opt_l $opt_n $opt_j /;                              # service
    use vars qw/ $opt_L $opt_c $opt_C /;                                     # factory
    use vars qw/ $opt_d $opt_i $opt_I $opt_o $opt_O $opt_a /;                # metadata
    use vars qw/ $opt_x $opt_w $opt_k $opt_s $opt_e $opt_t $opt_T $opt_b /;  # job
    use vars qw/ $opt_r $opt_R /;                                            # results
    use vars qw/ $opt_z /;                                                   # cleaning
    my $switches = 'ACjlnRT';   # switches taking an argument (a value)
    getopt ($switches);         # fills the $opt_* package variables

    use vars qw($VERSION $Revision);

    # set the version for version checking
    # (the digit groups of the RCS keyword become "major.minor")
    $VERSION = do { my @r = (q[$Revision: 1.10 $] =~ /\d+/g); sprintf "%d.%-02d", @r };
    $Revision = q[$Id: panalysis.PLS,v 1.10 2006-07-04 22:23:36 mauricio Exp $];

    # help wanted?
    if ($opt_h) {
        print get_usage();
        exit 0;
    }

    # print version of this script and exit
    if ($opt_v) {
        print "$0 $VERSION\n";
        exit 0;
    }
}
132 use Bio::Tools::Run::Analysis; # to access analysis tools directly
133 use Bio::Tools::Run::AnalysisFactory; # to access list/factory of analysis tools
# --- create a factory object;
#     the new() method understands the following parameters:
#       -location  (taken from '-l' option if given)
#       -access    (taken from '-A' option, default is 'soap')
#
#     Additionally, it uses env. variable HTTPPROXY to create parameter
#     '-httpproxy', and env. variable HTTPTIMEOUT to set max HTTP timeout.
#
# Each argument group below is either a (name, value) pair or an empty
# list, so the groups can simply be concatenated in the constructor calls.
# (A ternary is used instead of "my @x = (...) if COND", whose behaviour
# perlsyn documents as undefined.)
my @access    = defined $opt_A              ? ('-access',    $opt_A)              : ();
my @location  = defined $opt_l              ? ('-location',  $opt_l)              : ();
my @httpproxy = defined $ENV{'HTTPPROXY'}   ? ('-httpproxy', $ENV{'HTTPPROXY'})   : ();
my @timeout   = defined $ENV{'HTTPTIMEOUT'} ? ('-timeout',   $ENV{'HTTPTIMEOUT'}) : ();

# @access is passed through so that the '-A' option actually takes effect
my $factory = Bio::Tools::Run::AnalysisFactory->new
    (@access, @location, @httpproxy, @timeout);

# --- create an analysis (service) object;
#     the new() method understands the following parameters:
#       -location         (taken from '-l' option if given)
#       -access           (taken from '-A' option, default is 'soap')
#       -name             (taken from '-n' option; mandatory!, no default value)
#       -destroy_on_exit  (set to true if '-z' option given)
#       -httpproxy        (taken from an env.variable)
#       -timeout          (taken from an env.variable)
#
my @name    = defined $opt_n ? ('-name', $opt_n) : ();
# without '-z' the job is explicitly asked NOT to be destroyed on exit
my @destroy = $opt_z ? () : ('-destroy_on_exit', 0);
my $service = Bio::Tools::Run::Analysis->new
    (@name, @access, @location, @httpproxy, @timeout, @destroy);

die "Stopped. No success in accessing analysis factory.\n" unless $factory;
die "Stopped. No success in accessing analysis tools.\n"   unless $service;

# --- print class and version of "real-workers" and exit
if ($opt_V) {
    print ref $factory, " ", $factory->VERSION . "\n";
    print ref $service, " ", $service->VERSION . "\n";
    exit 0;
}
# --- here are methods of the "directory service" (factory)
#
# Each listing prints an underlined heading through msg() (which -q
# silences) and then the sorted names, one per line, on STDOUT.

# what categories are available?
if ($opt_c) {
    my $msg = "Available categories";
    msg ("$msg\n" . ('-' x length $msg) . "\n");
    print join ("\n", sort @{ $factory->available_categories }), "\n";
}

# what analyses are available?
if ($opt_L) {
    my $msg = "Available analyses";
    msg ("$msg\n" . ('-' x length $msg) . "\n");
    print join ("\n", sort @{ $factory->available_analyses }), "\n";
}

# what analyses are available in a particular category?
if ($opt_C) {
    my $msg = "Available analyses in category '$opt_C':";
    msg ("$msg\n" . ('-' x length $msg) . "\n");
    print join ("\n", sort @{ $factory->available_analyses ($opt_C) }), "\n";
}
# --- here are methods describing one analysis

# print full analysis metadata in XML
# ('$service->describe' returns an XML string)
print $service->describe . "\n" if $opt_d;

# print major characteristics of an analysis
# ('$service->analysis_spec' returns a hash reference)
if ($opt_a) {
    my $rh_spec = $service->analysis_spec;
    my $msg = "Specification of analysis";
    msg ("$msg\n" . ('-' x length $msg) . "\n");
    print "Analysis '$opt_n':\n";
    while (my ($key, $value) = each %{ $rh_spec }) {
        print "\t$key => $value\n";
    }
}

# print input specification (either full, or just input data names)
# ('$service->input_spec' returns a reference to an array of hashes)
if ($opt_i or $opt_I) {
    my $ra_spec = $service->input_spec;
    my $msg = "Specification of inputs";
    msg ("$msg\n" . ('-' x length $msg) . "\n");
    foreach my $input (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) {
        print $$input{'name'}, "\n";
        next unless $opt_I;            # -i: names only

        # -I: every attribute except the name, which was printed above
        while (my ($key, $value) = each %{ $input }) {
            next if $key eq 'name';
            if (ref $value eq 'ARRAY') {   # for 'allowed values'
                print "\t$key => " . join (", ", @$value) . "\n";
            } else {
                print "\t$key => $value\n";
            }
        }
    }
}

# print result specification (either full, or just names of results)
# ('$service->result_spec' returns a reference to an array of hashes)
if ($opt_o or $opt_O) {
    my $ra_spec = $service->result_spec;
    my $msg = "Specification of results";
    msg ("$msg\n" . ('-' x length $msg) . "\n");
    foreach my $result (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) {
        print $$result{'name'}, "\n";
        next unless $opt_O;            # -o: names only

        # -O: every attribute except the name, which was printed above
        while (my ($key, $value) = each %{ $result }) {
            print "\t$key => $value\n" unless $key eq 'name';
        }
    }
}
# --- let's create a job
#
# With -j an existing job is re-created from its ID and optionally run
# (-x), run and waited for (-w) or killed (-k). Without -j a new job is
# built from the name=value pairs left in @ARGV. In both branches only
# the first matching option of the chain is acted upon.
my $job;
if ($opt_j) {
    # ... either by re-creating a previous job
    $job = $service->create_job ($opt_j);

    if ($opt_x) {
        $job->run;
    } elsif ($opt_w) {
        $job->wait_for;
    } elsif ($opt_k) {
        $job->terminate;
    }

} else {
    # ... or creating a new job using given input data
    if ($opt_x) {
        $job = $service->run (\@ARGV);
    } elsif ($opt_w) {
        $job = $service->wait_for (\@ARGV);
    } elsif ($opt_b or @ARGV > 0) {
        $job = $service->create_job (\@ARGV);
    }

    # often you need to know the JOB's ID to be able to come back
    # later and ask for results, status, events etc. - so I print it
    # here even in quiet mode (option -q) - but to STDERR in order not
    # to intervene with redirected real results
    print STDERR "JOB ID: " , $job->id . "\n" if $job;
}
# --- having a job, ask it for something
#
# Nothing in this section runs unless a job was created or re-created
# earlier in the script.
if ($job) {
    print "JOB STATUS: " . $job->status . "\n" if $opt_s;
    print "LAST EVENT: " . $job->last_event . "\n" if $opt_e;

    # ...get job times (all of them in one go, formatted)
    if ($opt_t) {
        my $rh_times = $job->times (1);   # '1' means 'formatted'
        print "TIMES:\n";
        print "\tCreated: " . $$rh_times{'created'} . "\n" if $$rh_times{'created'};
        print "\tStarted: " . $$rh_times{'started'} . "\n" if $$rh_times{'started'};
        print "\tEnded:   " . $$rh_times{'ended'}   . "\n" if $$rh_times{'ended'};
        # 'elapsed' is tested with defined(), so a value of 0 is still shown
        print "\tElapsed: " . $$rh_times{'elapsed'} . "\n" if defined $$rh_times{'elapsed'};
    }

    # ...get individual job characteristics (both formatted and raw)
    # ($opt_T is a string of letters; 'a' selects every line)
    if ($opt_T) {
        print "CREATED: " . $job->created (1) . " (" . $job->created . ")\n" if $opt_T =~ /a|c/;
        print "STARTED: " . $job->started (1) . " (" . $job->started . ")\n" if $opt_T =~ /a|b/;
        print "ENDED:   " . $job->ended (1)   . " (" . $job->ended   . ")\n" if $opt_T =~ /a|f/;
        print "ELAPSED: " . $job->elapsed . "\n" if $opt_T =~ /a|e/;
    }

    # retrieve results: -R names them (comma-separated), -r asks for all
    my $rh_results;
    if ($opt_R) {
        $rh_results = $job->results (split /\s*,\s*/, $opt_R);
    } elsif ($opt_r) {
        $rh_results = $job->results ('?');
    }

    if ($rh_results) {
        foreach my $name (sort keys %$rh_results) {
            my $msg = "RESULT: $name";
            msg ("$msg\n" . ('-' x length $msg) . "\n");

            my $result = $$rh_results{$name};
            if (ref $result) {
                # ... this is probably what you do not want (binary on terminal);
                #     unless you wisely used: -R result_name=filename
                print join ("\n", @{ $result }) . "\n";
            } else {
                # a result that does not exist comes back undefined; print
                # it as an empty line without an "uninitialized" warning
                print +(defined $result ? $result : '') . "\n";
            }
        }
    }
}
# Print an informational message (headings and the like) to the
# currently selected output handle, unless quiet mode (-q) was requested.
#
#   $text - the string to print, used verbatim (no newline is added)
sub msg {
    my ($text) = @_;
    print $text unless $opt_q;
    return;
}
343 __END__
345 =head1 NAME
347 panalysis.PLS - An example/tutorial script how to access analysis tools
349 =head1 SYNOPSIS
351 # run an analysis with your sequence in a local file
352 ./panalysis.PLS -n 'edit.seqret' -w -r \
353 sequence_direct_data=@/home/testdata/my.seq
355 See more examples in the text below.
357 =head1 DESCRIPTION
359 A client showing how to use C<Bio::Tools::Run::Analysis> module, a module for
360 executing and controlling local or remote analysis tools. It also
361 calls methods from the C<Bio::Tools::Run::AnalysisFactory> module, a module
362 providing lists of available analyses.
364 Primarily, this client is meant as an example how to use analysis
365 modules, and also to test them. However, because it has a lot of
366 options in order to cover as many methods as possible, it can be also
367 used as a fully functional command-line client for accessing various
368 analysis tools.
370 =head2 Defining location and access method
372 C<panalysis.PLS> is independent of the access method to the remote
373 analyses (the analyses running on different machines). The method
374 used to communicate with the analyses is defined by the C<-A> option,
375 with the default value I<soap>. The other possible values (not yet
376 supported, but coming soon) are I<corba> and I<local>.
378 Each access method may have different meaning for parameter C<-l>
379 defining a location of services giving access to the analysis
380 tools. For example, the I<soap> access expects a URL of a Web Service
381 in the C<-l> option, while the I<corba> access may find here a
382 stringified Interoperable Object Reference (IOR).
384 A default location for the I<soap> access is
385 C<http://www.ebi.ac.uk/soaplab/services> which represents services
386 running at the European Bioinformatics Institute on top of over a hundred
387 EMBOSS analyses (and on top of a few others).
389 =head2 Available analyses
391 C<panalysis.PLS> can show a list of available analyses (from the given
392 location using given access method). The C<-L> option shows all
393 analyses, the C<-c> option lists all available categories (a category
394 is a group of analyses with similar functionality or processing
395 similar type of data), and finally the C<-C> option shows only
396 analyses available within the given category.
398 Note, that all these functions are provided by module
399 C<Bio::Tools::Run::AnalysisFactory> (respectively, by one of its
400 access-dependent sub-classes). The module has also a I<factory> method
401 C<create_analysis> which is not used by this script.
403 =head2 Service
405 A C<service> is a higher level of abstraction of an analysis tool. It
406 understands a well defined interface (module C<Bio::AnalysisI>), a fact
407 which allows this script to be independent of the access protocol to
408 various services.
410 The service name must be given by the C<-n> option. This option can be
411 omitted only if you invoked just the C<factory> methods (described
412 above).
414 Each service (representing an analysis tool, a program, or an
415 application) has its description, available by using options C<-a>
416 (analysis name, type, etc.), C<-i>, C<-I> (specification of analysis
417 input data, most important are their names), and C<-o>, C<-O> (result
418 names and their types). The option C<-d> gives the most detailed
419 description in the XML format.
421 The service description is nice but the most important is to use the
422 service for invoking an underlying analysis tool. For each invocation,
423 the service creates a C<job> and feeds it with input data. There are
424 three stages: (a) create a job, (b) run the job, and (c) wait for its
425 completion. Correspondingly, there are three options: the C<-b> which
426 just creates (builds) a job, the C<-x> which creates a job and
427 executes it, and finally C<-w> which creates a job, runs it and blocks
428 the client until the job is finished. Always only one of these options
429 is used (so it does not make sense to use more of them, the
430 C<panalysis.PLS> prioritizes them in the order C<-x>, C<-w>, and
431 C<-b>).
433 All of these options take input data from the command-line (see next
434 section about it) and all of them return (internally) an object
435 representing a job. There are many methods (options) dealing with the
436 job objects (see one after next section about them).
438 Last note in this section: the C<-b> option is actually optional - a
439 job is created even without this option when there are some input data
440 found on the command-line. You I<have> to use it, however, if you do
441 not pass any data to an analysis tool (an example would be the famous
442 C<Classic::HelloWorld> service).
444 =head2 Input data
446 Input data are given as name/value pairs, put on the command-line with
447 equal sign between name and value. If the I<value> part starts with
448 an un-escaped character C<@>, it is used as a local file name and the
449 C<panalysis.PLS> reads the file and uses its contents instead. Examples:
451 panalysis.PLS -n edit.seqret -w -r
452 sequence_direct_data='tatatctcccc' osformat=embl
454 panalysis.PLS ...
455 sequence_direct_data=@/my/data/my.seq
457 The names of input data come from the C<input specification> that can
458 be shown by the C<-i> or C<-I> options. The input specification (when
459 using option C<-I>) shows also - for some inputs - a list of allowed
460 values. The specification, however, does not tell what input data are
461 mutually exclusive, or what other constraints apply. If there is a
462 conflict, an error message is produced later (before the job starts).
464 Input data are used when any of the options C<-b>, C<-x>, or C<-w> is
465 present, but option C<-j> is not present (see next section about this
466 job option).
468 =head2 Job
470 Each service (defined by a name given in the C<-n> option) can be
471 executed one or more times, with the same, but usually with different
472 input data. Each execution creates a I<job object>. Actually, the job
473 is created even before execution (remember that option C<-b> builds a
474 job but does not execute it yet).
476 Any job, executed or not, is persistent and can be used again later
477 from another invocation of the C<panalysis.PLS> script. Unless you
478 explicitly destroy the job using option C<-z>.
480 A job created by options C<-b>, C<-x> and C<-w> (and by input data)
481 can be accessed in the same C<panalysis.PLS> invocation using various
482 job-related options, the most important are C<-r> and C<-R> for
483 retrieving results from the finished job.
485 However, you can also re-create a job created by a previous
486 invocation. Assuming that you know the job ID (the C<panalysis.PLS>
487 prints it always on the standard error when a new job is created), use
488 option C<-j> to re-create the job.
490 Example:
492 ./panalysis.PLS -n 'edit.seqret'
493 sequence_direct_data=@/home/testdata/my.seq
495 It prints:
497 JOB ID: edit.seqret/bb494b:ef55e47c99:-8000
499 Next invocation (asking to run the job, to wait for its completion and
500 to show job status) can be:
502 ./panalysis.PLS -n 'edit.seqret'
503 -j edit.seqret/bb494b:ef55e47c99:-8000
504 -w -s
506 And again later another invocation can ask for results:
508 ./panalysis.PLS -n 'edit.seqret'
509 -j edit.seqret/bb494b:ef55e47c99:-8000
510 -r
512 Here is a list of all job options (except for results, they are in the
513 next section):
515 =over 4
517 =item Job execution and termination
519 There are the same options C<-x> and C<-w> for executing a job and for
520 executing it and waiting for its completion, as they were described
521 above. But now, the options act on a job given by the C<-j> option,
522 now they do not use any input data from the command-line (the input
523 data had to be used when the job was created).
525 Additionally, there is a C<-k> option to kill a running job.
527 =item Job characteristics
529 Other options tell about the job status (C<-s>), about the job
530 execution times (C<-t> and C<-T>), and about the last available event
531 what happened with the job (C<-e>). Note that the event notification is
532 not yet fully implemented, so this option will change in the future to
533 reflect more notification capabilities.
535 =back
537 =head2 Results
539 Of course, the most important thing about the analysis tools is their
540 results. The results are named (in a similar way to the input data)
541 and they can be retrieved all in one go using option C<-r> (so you do
542 not need to know their names actually), or by specifying (all or some)
543 result names using the C<-R> option.
545 If a result does not exist (either not yet, or the name is wrong) an
546 undef value is returned (no error message produced).
548 Some results are better saved directly into files instead of showing
549 them in the terminal window (this applies to the I<binary> results,
550 mostly containing images). The C<panalysis.PLS> helps to deal with
551 binary results by saving them automatically to local files (actually
552 it is the module C<Bio::Tools::Run::Analysis> and its submodules
553 who do help with the binary data).
555 So why not to use a traditional shell re-direction to a file? There are
556 two reasons. First, a job can produce more than one result, so they
557 would be mixed together. But mainly, because each result can consist
558 of several parts whose number is not known in advance and which cannot
559 be mixed together in one file. Again, this is typical for the binary
560 data returning images - an invocation can produce many images.
562 The C<-r> option retrieves all available results and treats them as
563 described by the C<'?'> format below.
565 The C<-R> option has a comma-separated list of result names, each of
566 the names can be either a simple name (as specified by the C<result
567 specification> obtainable using the C<-o> or C<-O> options), or an
568 equal-sign-separated name/format construct suggesting what to do with
569 the result. The possibilities are:
571 =over 4
573 =item result-name
575 It prints given result on the standard output.
577 =item result-name=filename
579 It saves the given result into given file.
581 =item result-name=@
583 It saves the given result into a file whose name is automatically
584 invented, and it guarantees that the same name will not be used in
585 the next invocation.
587 =item result-name=@template
589 It saves the given result into a file whose name is given by the
590 C<template>. The template can contain several strings which are
591 substituted before using it as the filename:
593 =over 4
595 =item Any '*'
597 Will be replaced by a unique number
599 =item $ANALYSIS or ${ANALYSIS}
601 Will be replaced by the current analysis name
603 =item $RESULT or ${RESULT}
605 Will be replaced by the current result name
607 How to tell what to do with results? Each result name
609 =back
611 Additionally, a template can be given as an environment variable
612 C<RESULT_FILENAME_TEMPLATE>. Such variable is used for any result
613 having in its format a simple C<?> or C<@> character.
615 =item result-name=?
617 It first decides whether the given result is binary or not. Then, the
618 binary results are saved into local files whose names are
619 automatically invented, the other results are sent to the standard
620 output.
622 =item result-name=?template
624 The same as above but the filenames for binary files are deduced from
625 the given template (using the same rules as described above).
627 =back
629 Examples:
632 -R report
633 -R report,outseq
634 -R Graphics_in_PNG=@
635 -R Graphics_in_PNG=@$ANALYSIS-*-$RESULT
637 Note that the result formatting will be enriched in the future by
638 using existing data type parsers in bioperl.
640 =head1 FEEDBACK
642 =head2 Mailing Lists
644 User feedback is an integral part of the evolution of this and other
645 Bioperl modules. Send your comments and suggestions preferably to
646 the Bioperl mailing list. Your participation is much appreciated.
648 bioperl-l@bioperl.org - General discussion
649 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
651 =head2 Reporting Bugs
653 Report bugs to the Bioperl bug tracking system to help us keep track
654 of the bugs and their resolution. Bug reports can be submitted via the
655 web:
657 http://redmine.open-bio.org/projects/bioperl/
659 =head1 AUTHOR
661 Martin Senger (martin.senger@gmail.com)
663 =head1 COPYRIGHT
665 Copyright (c) 2003, Martin Senger and EMBL-EBI.
666 All Rights Reserved.
668 This script is free software; you can redistribute it and/or modify
669 it under the same terms as Perl itself.
671 =head1 DISCLAIMER
673 This software is provided "as is" without warranty of any kind.
675 =head1 BUGS AND LIMITATIONS
677 None known at the time of writing this.
679 =cut