#!/usr/bin/perl -w
#
# A client showing how to use Bio::Tools::Run::Analysis module,
# a module for executing and controlling local or remote analysis tools.
# It also calls methods from Bio::Tools::Run::AnalysisFactory module.
#
# It has many options in order to cover as many methods as
# possible. Because of that, it can be also used as a fully
# functional command-line client for accessing various analysis
# tools.
#
# Usage: ./analysis -h
#    or: perldoc analysis.pl
#
# senger@ebi.ac.uk
# July 2002
#
# $Id: panalysis.PLS,v 1.2 2003-03-12 13:11:26 senger Exp $
#-----------------------------------------------------------------------------

use strict;

sub get_usage {
    return <<"END_OF_USAGE";
Usage:
  analysis.pl [options] [input-data]

where 'options' are:
  -A <access>     access method (default 'soap')
  -l <location>   where are the analyses
  -n <name>       name of an analysis
  -j <job-id>     ID of a previously created job

  -L              list all available analyses
  -c              list all available categories
  -C <category>   show all analyses in given category

  -i, -I          show specification of data inputs
  -o, -O          show specification of results
  -a              show specification of the analysis
  -d              show analysis metadata (XML)

  -b              create job from [input-data]
                  (default: create a job also without -b option
                  if there is no -j option and if there are some
                  'input-data' on the command-line)
  -x              create job from [input-data] and run it
  -w              create job from [input-data], run it and wait for it
  -x -j <job-id>  run a previously created job
  -w -j <job-id>  run a previously created job and wait for it
  -k -j <job-id>  kill a previously created job

  -s              show job status
  -t              show all job times
  -T <acbfe>      show some job times (all, created, begun, finished, elapsed)
  -e              show job last event (XML)

  -r              retrieve all results
  -R <list>       retrieve named results; comma-separated list, each item:
                      <result-name>
                      <result-name>=<filename>
                      <result-name>=@[filename-template]
                      <result-name>=?[filename-template]
                  where 'filename-template' can contain:
                      *          ... will be replaced by a unique number
                      \$ANALYSIS ... will be replaced by an analysis name
                      \$RESULT   ... will be replaced by a result name
                      any other characters (suitable for filenames)

  -z              at the end remove job and all its results

  -h              this help
  -v, -V          show version(s)
  -q              be less verbose

where 'input-data' are:
  <input-data-name>=<value>...
  <input-data-name>=@<filename-with-value>...

Environment variables:
  HTTPPROXY                 HTTP proxy server
  HTTPTIMEOUT               HTTP timeout (0 means no timeout at all)
  RESULT_FILENAME_TEMPLATE  template for inventing filenames for results

For more details type: perldoc analysis.pl

END_OF_USAGE
}

BEGIN {
    # add path to the directory with this script
    my $mylib;
    ($mylib = $0) =~ s|/[^/]+$||;
    unshift @INC, $mylib;

    # be prepared for command-line options/arguments
    use Getopt::Std;

    # general options
    use vars qw/ $opt_h $opt_v $opt_V $opt_q /;
    # specialized options
    use vars qw/ $opt_A $opt_l $opt_n $opt_j /;                              # service
    use vars qw/ $opt_L $opt_c $opt_C /;                                     # factory
    use vars qw/ $opt_d $opt_i $opt_I $opt_o $opt_O $opt_a /;                # metadata
    use vars qw/ $opt_x $opt_w $opt_k $opt_s $opt_e $opt_t $opt_T $opt_b /;  # job
    use vars qw/ $opt_r $opt_R /;                                            # results
    use vars qw/ $opt_z /;                                                   # cleaning
    my $switches = 'ACjlnRT';   # switches taking an argument (a value)
    getopt ($switches);

    use vars qw($VERSION $Revision);

    # set the version for version checking
    $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d.%-02d", @r };
    $Revision = q$Id: panalysis.PLS,v 1.2 2003-03-12 13:11:26 senger Exp $;

    # help wanted?
    if ($opt_h) {
        print get_usage;
        exit 0;
    }

    # print version of this script and exit
    if ($opt_v) {
        print "$0 $VERSION\n";
        exit 0;
    }
}

use Bio::Tools::Run::Analysis;         # to access analysis tools directly
use Bio::Tools::Run::AnalysisFactory;  # to access list/factory of analysis tools

# --- create a factory object;
#     the new() method understands the following parameters:
#         -location  (taken from '-l' option if given)
#         -access    (taken from '-A' option, default is 'soap')
#
#     Additionally, it uses env. variable HTTPPROXY to create parameter
#     '-httpproxy', and env. variable HTTPTIMEOUT to set max HTTP timeout.

my @access    = ('-access',    $opt_A)              if defined $opt_A;
my @location  = ('-location',  $opt_l)              if defined $opt_l;
my @httpproxy = ('-httpproxy', $ENV{'HTTPPROXY'})   if defined $ENV{'HTTPPROXY'};
my @timeout   = ('-timeout',   $ENV{'HTTPTIMEOUT'}) if defined $ENV{'HTTPTIMEOUT'};
my $factory = new Bio::Tools::Run::AnalysisFactory (@access, @location, @httpproxy, @timeout);

# --- create an analysis (service) object;
#     the new() method understands the following parameters:
#         -location         (taken from '-l' option if given)
#         -access           (taken from '-A' option, default is 'soap')
#         -name             (taken from '-n' option; mandatory!, no default value)
#         -destroy_on_exit  (set to true if '-z' option given)
#         -httpproxy        (taken from an env.variable)
#         -timeout          (taken from an env.variable)

my @name    = ('-name', $opt_n)       if defined $opt_n;
my @destroy = ('-destroy_on_exit', 0) unless $opt_z;
my $service = new Bio::Tools::Run::Analysis (@access, @name, @location, @httpproxy, @timeout, @destroy);

die "Stopped. No success in accessing analysis factory.\n" unless $factory;
die "Stopped. No success in accessing analysis tools.\n" unless $service;

# --- print class and version of "real-workers" and exit
if ($opt_V) {
    print ref $factory, " ", $factory->VERSION . "\n";
    print ref $service, " ", $service->VERSION . "\n";
    exit 0;
}

# --- here are methods of the "directory service" (factory)

# what categories are available?
if ($opt_c) {
    my $msg = "Available categories";
    &msg ("$msg\n" . '-' x length ($msg) . "\n");
    print join ("\n", sort @{ $factory->available_categories }), "\n";
}

# what analyses are available?
if ($opt_L) {
    my $msg = "Available analyses";
    &msg ("$msg\n" . '-' x length ($msg) . "\n");
    print join ("\n", sort @{ $factory->available_analyses }), "\n";
}

# what analyses are available in a particular category?
if ($opt_C) {
    my $msg = "Available analyses in category '$opt_C':";
    &msg ("$msg\n" . '-' x length ($msg) . "\n");
    print join ("\n", sort @{ $factory->available_analyses ($opt_C) }), "\n";
}

# --- here are methods describing one analysis

# print full analysis metadata in XML
# ('$service->describe' returns an XML string)
print $service->describe . "\n" if $opt_d;

# print major characteristics of an analysis
# ('$service->analysis_spec' returns a hash reference)
if ($opt_a) {
    my $rh_spec = $service->analysis_spec;
    my $msg = "Specification of analysis";
    &msg ("$msg\n" . '-' x length ($msg) . "\n");
    my ($key, $value);
    print "Analysis '$opt_n':\n";
    while (($key, $value) = each %{ $rh_spec }) {
        print "\t$key => $value\n";
    }
}

# print input specification (either full, or just input data names)
# ('$service->input_spec' returns a reference to an array of hashes)
if ($opt_i or $opt_I) {
    my $ra_spec = $service->input_spec;
    my $msg = "Specification of inputs";
    &msg ("$msg\n" . '-' x length ($msg) . "\n");
    my ($key, $value);
    foreach (sort { $$a{'name'} cmp $$b{'name'} } @$ra_spec) {
        print $$_{'name'}, "\n";
        if ($opt_I) {
            while (($key, $value) = each %{ $_ }) {
                unless ($key eq 'name') {
                    if (ref $value eq 'ARRAY') {   # for 'allowed values'
                        print "\t$key => " . join (", ", @$value) . "\n";
                    } else {
                        print "\t$key => $value\n";
                    }
                }
            }
        }
    }
}

# print result specification (either full, or just names of results)
# ('$service->result_spec' returns a hash reference where keys are result names)
if ($opt_o or $opt_O) {
    my $rh_spec = $service->result_spec;
    my $msg = "Specification of results";
    &msg ("$msg\n" . '-' x length ($msg) . "\n");
    foreach (sort keys %{ $rh_spec }) {
        print $_, ($opt_O ? "\t(of type: $$rh_spec{$_})" : ''), "\n";
    }
}

# --- let's create a job

my $job;
if ($opt_j) {
    # ... either by re-creating a previous job
    $job = $service->create_job ($opt_j);

    if ($opt_x) {
        $job->run;
    } elsif ($opt_w) {
        $job->wait_for;
    } elsif ($opt_k) {
        $job->terminate;
    }

} else {
    # ... or creating a new job using given input data
    if ($opt_x) {
        $job = $service->run (\@ARGV);
    } elsif ($opt_w) {
        $job = $service->wait_for (\@ARGV);
    } elsif ($opt_b or @ARGV > 0) {
        $job = $service->create_job (\@ARGV);
    }

    # often you need to know the JOB's ID to be able to come back
    # later and ask for results, status, events etc. - so I print it
    # here even in quiet mode (option -q) - but to STDERR in order not
    # to interfere with redirected real results
    print STDERR "JOB ID: ", $job->id . "\n" if $job;
}

# --- having a job, ask it for something

if ($job) {
    print "JOB STATUS: " . $job->status . "\n" if $opt_s;
    print "LAST EVENT: " . $job->last_event . "\n" if $opt_e;

    # ...get job times (all of them in one go, formatted)
    if ($opt_t) {
        my $rh_times = $job->times (1);   # '1' means 'formatted'
        print "TIMES:\n";
        print "\tCreated: " . $$rh_times{'created'} . "\n" if $$rh_times{'created'};
        print "\tStarted: " . $$rh_times{'started'} . "\n" if $$rh_times{'started'};
        print "\tEnded:   " . $$rh_times{'ended'} . "\n" if $$rh_times{'ended'};
        print "\tElapsed: " . $$rh_times{'elapsed'} . "\n" if defined $$rh_times{'elapsed'};
    }

    # ...get individual job characteristics (both formatted and raw)
    if ($opt_T) {
        print "CREATED: " . $job->created (1) . " (" . $job->created . ")\n" if $opt_T =~ /a|c/;
        print "STARTED: " . $job->started (1) . " (" . $job->started . ")\n" if $opt_T =~ /a|b/;
        print "ENDED:   " . $job->ended (1) . " (" . $job->ended . ")\n" if $opt_T =~ /a|f/;
        print "ELAPSED: " . $job->elapsed . "\n" if $opt_T =~ /a|e/;
    }

    # retrieve results
    my $rh_results;
    if ($opt_R) {
        $rh_results = $job->results (split /\s*,\s*/, $opt_R);
    } elsif ($opt_r) {
        $rh_results = $job->results ('?');
    }
    if ($rh_results) {
        foreach my $name (sort keys %$rh_results) {
            my $msg = "RESULT: $name";
            &msg ("$msg\n" . '-' x length ($msg) . "\n");

            if (ref $$rh_results{$name}) {
                # ... this is probably what you do not want (binary on terminal);
                #     unless you wisely used: -R result_name=filename
                print join ("\n", @{ $$rh_results{$name} }) . "\n";
            } else {
                print $$rh_results{$name} . "\n";
            }
        }
    }
}

sub msg {
    print shift unless $opt_q;
}

__END__

=head1 NAME

analysis.pl - An example/tutorial script showing how to access analysis tools

=head1 SYNOPSIS

  # run an analysis with your sequence in a local file
  ./analysis.pl -n 'edit::seqret' -w -r \
        sequence_direct_data=@/home/testdata/my.seq

See more examples in the text below.

=head1 DESCRIPTION

A client showing how to use the C<Bio::Tools::Run::Analysis> module, a
module for executing and controlling local or remote analysis tools. It
also calls methods from the C<Bio::Tools::Run::AnalysisFactory> module,
a module providing lists of available analyses.

Primarily, this client is meant as an example of how to use the analysis
modules, and also as a way to test them. However, because it has a lot
of options in order to cover as many methods as possible, it can also be
used as a fully functional command-line client for accessing various
analysis tools.

=head2 Defining location and access method

C<analysis.pl> is independent of the access method used to reach the
remote analyses (the analyses running on different machines). The
method used to communicate with the analyses is defined by the C<-A>
option, with the default value I<soap>. The other possible values (not
yet supported, but coming soon) are I<corba> and I<local>.

Each access method may give a different meaning to the C<-l> parameter,
which defines the location of the services giving access to the
analysis tools. For example, the I<soap> access expects a URL of a Web
Service in the C<-l> option, while the I<corba> access may find here a
stringified Interoperable Object Reference (IOR).

The default location for the I<soap> access is
C<http://industry.ebi.ac.uk/soap/soaplab>, which represents an
experimental service running at the European Bioinformatics Institute
on top of over a hundred EMBOSS analyses.

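If you prefer to call the module directly instead of going through this
script's options, the same parameters are simply passed to the
constructor. The following is only a sketch of what C<analysis.pl>
itself does; the location shown is just the default mentioned above,
and C<edit::seqret> is only an example service name:

  use Bio::Tools::Run::Analysis;

  # the programmatic equivalent of the -A, -l and -n options
  my $service = Bio::Tools::Run::Analysis->new
      (-access   => 'soap',
       -location => 'http://industry.ebi.ac.uk/soap/soaplab',
       -name     => 'edit::seqret');
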
=head2 Available analyses

C<analysis.pl> can show a list of available analyses (from the given
location, using the given access method). The C<-L> option shows all
analyses, the C<-c> option lists all available categories (a category
is a group of analyses with similar functionality, or processing
similar types of data), and finally the C<-C> option shows only the
analyses available within the given category.

Note that all these functions are provided by the module
C<Bio::Tools::Run::AnalysisFactory> (respectively, by one of its
access-dependent sub-classes). The module also has a I<factory> method
C<create_analysis>, which is not used by this script.

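The corresponding factory calls, as used inside this script, look
roughly like this (a sketch; C<edit> is only an example category name):

  use Bio::Tools::Run::AnalysisFactory;

  my $factory = Bio::Tools::Run::AnalysisFactory->new (-access => 'soap');
  print join ("\n", sort @{ $factory->available_categories }), "\n";        # -c
  print join ("\n", sort @{ $factory->available_analyses ('edit') }), "\n"; # -C
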
=head2 Service

A C<service> is a higher level of abstraction of an analysis tool. It
understands a well-defined interface (module C<Bio::AnalysisI>), a fact
which allows this script to be independent of the access protocol to
the various services.

The service name must be given by the C<-n> option. This option can be
omitted only if you invoke just the C<factory> methods (described
above).

Each service (representing an analysis tool, a program, or an
application) has its description, available by using the options C<-a>
(analysis name, type, etc.), C<-i>, C<-I> (specification of analysis
input data, most importantly their names), and C<-o>, C<-O> (result
names and their types). The option C<-d> gives the most detailed
description, in XML format.

The service description is nice, but the most important thing is to use
the service for invoking an underlying analysis tool. For each
invocation, the service creates a C<job> and feeds it with input
data. There are three stages: (a) create a job, (b) run the job, and
(c) wait for its completion. Correspondingly, there are three options:
C<-b>, which just creates (builds) a job; C<-x>, which creates a job
and executes it; and finally C<-w>, which creates a job, runs it and
blocks the client until the job is finished. Only one of these options
is used at a time (it does not make sense to combine them;
C<analysis.pl> prioritizes them in the order C<-x>, C<-w>, C<-b>).

All of these options take input data from the command-line (see the
next section about that) and all of them return (internally) an object
representing a job. There are many methods (options) dealing with the
job objects (see the section after next).

A last note for this section: the C<-b> option is actually optional - a
job is created even without this option when there are some input data
found on the command-line. You I<have> to use it, however, if you do
not pass any data to an analysis tool (an example would be the famous
C<Classic::HelloWorld> service).

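In code, the description options map directly onto service methods.
A sketch, reusing a C<$service> object created as shown earlier:

  print $service->describe, "\n";             # -d : full metadata as XML
  my $rh_analysis = $service->analysis_spec;  # -a : hash reference
  my $ra_inputs   = $service->input_spec;     # -i : reference to an array of hashes
  my $rh_outputs  = $service->result_spec;    # -o : hash reference (keys are result names)
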
=head2 Input data

Input data are given as name/value pairs, put on the command-line with
an equal sign between name and value. If the I<value> part starts with
an un-escaped character C<@>, it is used as a local file name and
C<analysis.pl> reads the file and uses its contents instead. Examples:

  analysis.pl -n edit::seqret -w -r
              sequence_direct_data='tatatctcccc' osformat=embl

  analysis.pl ...
              sequence_direct_data=@/my/data/my.seq

The names of input data come from the C<input specification> that can
be shown by the C<-i> or C<-I> options. The input specification (when
using option C<-I>) also shows - for some inputs - a list of allowed
values. The specification, however, does not tell which input data are
mutually exclusive, or what other constraints apply. If there is a
conflict, an error message is produced later (before the job starts).

Input data are used when any of the options C<-b>, C<-x>, or C<-w> is
present, but option C<-j> is not present (see the next section about
this job option).

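Internally, C<analysis.pl> simply hands the remaining command-line
arguments (the C<name=value> strings) to the service, so the
programmatic equivalent is roughly the following sketch (the input
names and the file path are only examples):

  my $job = $service->create_job
      ([ 'sequence_direct_data=@/my/data/my.seq', 'osformat=embl' ]);
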
=head2 Job

Each service (defined by a name given in the C<-n> option) can be
executed one or more times, with the same, but usually with different,
input data. Each execution creates a I<job object>. Actually, the job
is created even before execution (remember that option C<-b> builds a
job but does not execute it yet).

Any job, executed or not, is persistent and can be used again later
from another invocation of the C<analysis.pl> script, unless you
explicitly destroy the job using option C<-z>.

A job created by options C<-b>, C<-x> and C<-w> (and by input data)
can be accessed in the same C<analysis.pl> invocation using various
job-related options, the most important of which are C<-r> and C<-R>
for retrieving results from the finished job.

However, you can also re-create a job created by a previous
invocation. Assuming that you know the job ID (C<analysis.pl> always
prints it on the standard error when a new job is created), use
option C<-j> to re-create the job.

Example:

  ./analysis.pl -n 'edit::seqret'
                sequence_direct_data=@/home/testdata/my.seq

It prints:

  JOB ID: edit::seqret/bb494b:ef55e47c99:-8000

The next invocation (asking to run the job, to wait for its completion
and to show the job status) can be:

  ./analysis.pl -n 'edit::seqret'
                -j edit::seqret/bb494b:ef55e47c99:-8000
                -w -s

And later again, another invocation can ask for results:

  ./analysis.pl -n 'edit::seqret'
                -j edit::seqret/bb494b:ef55e47c99:-8000
                -r

Here is a list of all job options (except for results, which are in the
next section); a short code sketch follows the list:

=over

=item Job execution and termination

The same options C<-x> and C<-w> described above are used for executing
a job and for executing it and waiting for its completion. Now,
however, they act on the job given by the C<-j> option and do not use
any input data from the command-line (the input data had to be supplied
when the job was created).

Additionally, there is a C<-k> option to kill a running job.

=item Job characteristics

Other options report the job status (C<-s>), the job execution times
(C<-t> and C<-T>), and the last available event that happened to the
job (C<-e>). Note that the event notification is not yet fully
implemented, so this option will change in the future to reflect more
notification capabilities.

=back

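The same job manipulations can be done directly from Perl, using the
methods this script itself calls. A sketch (the job ID is the
hypothetical one from the example above):

  my $job = $service->create_job ('edit::seqret/bb494b:ef55e47c99:-8000');
  $job->run;                        # -x
  $job->wait_for;                   # -w (blocks until the job finishes)
  print $job->status, "\n";         # -s
  print $job->last_event, "\n";     # -e
  my $rh_times = $job->times (1);   # -t ('1' means formatted)
  $job->terminate;                  # -k
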
=head2 Results

Of course, the most important thing about the analysis tools is their
results. The results are named (in a similar way to the input data)
and they can be retrieved all in one go using option C<-r> (so you do
not actually need to know their names), or by specifying (all or some)
result names using the C<-R> option.

If a result does not exist (either not yet, or because the name is
wrong) an undef value is returned (no error message is produced).

Some results are better saved directly into files instead of being
shown in the terminal window (this applies to the I<binary> results,
mostly containing images). C<analysis.pl> helps to deal with binary
results by saving them automatically to local files (actually it is
the module C<Bio::Tools::Run::Analysis> and its submodules that do
the work with the binary data).

So why not use a traditional shell redirection to a file? There are
two reasons. First, a job can produce more than one result, so they
would be mixed together. But mainly, because each result can consist
of several parts whose number is not known in advance and which cannot
be mixed together in one file. Again, this is typical for binary data
returning images - an invocation can produce many images.

The C<-r> option retrieves all available results and treats them as
described by the C<'?'> format below.

The C<-R> option takes a comma-separated list of result names, each of
which can be either a simple name (as specified by the C<result
specification> obtainable using the C<-o> or C<-O> options), or an
equal-sign-separated name/format construct suggesting what to do with
the result. The possibilities are:

=over

=item result-name

It prints the given result on the standard output.

=item result-name=filename

It saves the given result into the given file.

=item result-name=@

It saves the given result into a file whose name is automatically
invented, and it guarantees that the same name will not be used in
the next invocation.

=item result-name=@template

It saves the given result into a file whose name is given by the
C<template>. The template can contain several strings which are
substituted before it is used as the filename:

=over

=item Any '*'

Will be replaced by a unique number.

=item $ANALYSIS or ${ANALYSIS}

Will be replaced by the current analysis name.

=item $RESULT or ${RESULT}

Will be replaced by the current result name.

=back

Additionally, a template can be given in the environment variable
C<RESULT_FILENAME_TEMPLATE>. Such a variable is used for any result
whose format is a simple C<?> or C<@> character.

=item result-name=?

It first decides whether the given result is binary or not. The binary
results are then saved into local files whose names are automatically
invented; the other results are sent to the standard output.

=item result-name=?template

The same as above, but the filenames for binary files are deduced from
the given template (using the same rules as described above).

=back

Examples:

  -R report
  -R report,outseq
  -R Graphics_in_PNG=@
  -R Graphics_in_PNG=@$ANALYSIS-*-$RESULT

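The same formats can be used programmatically, by passing them to the
C<results> method exactly as this script does (a sketch; C<report> and
C<Graphics_in_PNG> are only example result names):

  my $rh_results = $job->results ('?');   # like -r
  $rh_results = $job->results
      ('report', 'Graphics_in_PNG=@$ANALYSIS-*-$RESULT');   # like -R
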
Note that the result formatting will be enriched in the future by
using existing data type parsers in bioperl.

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list. Your participation is much appreciated.

  bioperl-l@bioperl.org              - General discussion
  http://bioperl.org/MailList.shtml  - About the mailing lists

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via
email or the web:

  bioperl-bugs@bioperl.org
  http://bioperl.org/bioperl-bugs/

=head1 AUTHOR

Martin Senger (senger@ebi.ac.uk)

=head1 COPYRIGHT

Copyright (c) 2003, Martin Senger and EMBL-EBI.
All Rights Reserved.

This script is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 DISCLAIMER

This software is provided "as is" without warranty of any kind.

=head1 BUGS AND LIMITATIONS

None known at the time of writing this.

=cut