hmmer.t: Fixed minor mistake
[bioperl-live.git] / scripts / DB-HIV / bp_hivq.pl
blob73f7d3e566f5cb4ed0e6d12f340f0ed84c0e91c2
1 #!perl
4 # command-line script for HIV sequence queries
5 # using HIV.pm and HIVQuery.pm
7 =head1 NAME
9 bp_hivq.PL - an interactive command-line interface to L<Bio::DB::HIV> and L<Bio::DB::Query::HIVQuery>
11 =head1 SYNOPSIS
13 $ perl bp_hivq.PL
14 hivq> query C[subtype] SI[phenotype]
15 hivq> prerun
16 80 sequences returned
17 Query: C[subtype] SI[phenotype]
18 hivq> outfile csi.fas
19 hivq> run
20 Download complete.
21 hivq> outfile dsi.fas
22 hivq> run D[subtype] SI[phenotype]
23 Download complete.
24 hivq> count
25 25 sequences returned
26 Query: D[subtype] SI[phenotype]
27 hivq> exit
30 =head1 DESCRIPTION
32 The BioPerl modules L<Bio::DB::HIV> and L<Bio::DB::Query::HIVQuery>
33 together allow batch queries against the Los Alamos National
34 Laboratories' HIV Sequence Database using a simple query
35 language. C<bp_hivq.PL> provides both an example of the use of these
36 modules, and a standalone interactive command-line interface to the
37 LANL HIV DB. Simple commands allow the user to retrieve HIV sequences
38 and annotations using the query language implemented in
39 L<Bio::DB::Query::HIVQuery>. Visit the man pages for those modules for
40 more details.
42 =head1 USAGE
44 Run the script using C<perl bp_hivq.PL> or, in Unix, C<./bp_hivq.PL>. You will see
45 the
47 hivq>
49 prompt. Type commands with queries to retrieve sequence and annotation
50 data. See the SYNOPSIS for a sample session. Available commands
51 are described below.
53 =head2 TIPS
55 The LANL database is pretty complex and extensive. Use the C<find> facility to
56 explore the available database tables and fields. To identify aliases for a particular field, use C<find alias [fieldname]>. For example, to find a short alias to the
57 weirdly named field C<seq_sample.ssam_second_receptor>, do
59 hivq> find alias seq_sample.ssam_second_receptor
61 which returns
63 coreceptor second_receptor
65 Now, instead of the following query
67 hivq> run C[subtype] CCR5[seq_sample.ssam_second_receptor]
69 you know you can do
71 hivq> run C[subtype] CCR5[coreceptor]
73 Use the C<outfile> command to set the file that receives the retrieved
74 sequences. You can change the current output file simply by issuing a
75 new C<outfile> command during the session. The output file defaults to
76 standard output.
78 Use the C<query> command to validate a query without hitting the
79 database. Use the C<prerun> or C<count> commands to get a count of
80 sequence hits for a query without retrieving the data. Use C<run> or
81 C<do> to perform a complete query, retrieving sequence data into the
82 currently set output files.
84 To process C<bp_hivq.PL> commands in batch, create a text file
85 (C<bp_hivq.cmd>, for example) containing desired commands one per
86 line. Then execute the following from the shell:
88 $ cat bp_hivq.cmd | perl bp_hivq.PL
90 =head1 COMMANDS
92 Here is a complete list of commands. Options in single brackets (C<[req_option]>) are
93 required; options in double brackets (C<[[opt_option]]>) are optional.
95 confirm : Toggle interactive confirmation before
96 executing queries
97 exit : Quit script
98 find : Explore database schema
99 find tables Display all database tables
100 find fields Display all database fields (columns)
101 find fields [table] Display all fields in [table]
102 find alias [field] Display valid aliases for [field]
103 help [[command]] : Show command help
104 if [[command]] not specified, list all
105 available commands
106 id : Display current session id
107 outfile [filename] : Set file for collecting retrieved data
108 ping : Check if LANL DB is available
109 prerun [[query]] : Execute query but retreive hit count only
110 if [[query]] not specified, use current query
111 query [query] : Validate and set current query
112 run [[query]] : Execute query and retrieve data
113 if [[query]] not specified, use current query
114 state : Display current state of the script
116 bye : Alias for 'exit'
117 config : Alias for 'state'
118 count : Alias for 'prerun'
119 do : Alias for 'run'
120 out : Alias for 'outfile'
121 quit : Alias for 'exit'
123 =head1 OPTIONS
125 -v : verbose; turns on the internal debug() function
127 =head1 FEEDBACK
129 =head2 Mailing Lists
131 User feedback is an integral part of the evolution of this and other
132 Bioperl modules. Send your comments and suggestions preferably to
133 the Bioperl mailing list. Your participation is much appreciated.
135 bioperl-l@bioperl.org - General discussion
136 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
138 =head2 Reporting Bugs
140 Report bugs to the Bioperl bug tracking system to help us keep track
141 of the bugs and their resolution. Bug reports can be submitted via the
142 web:
144 https://github.com/bioperl/bioperl-live/issues
146 =head1 AUTHOR - Mark A. Jensen
148 Mark A. Jensen E<lt>maj@fortinbras.usE<gt>
150 =cut
152 $|=1;
153 use strict;
154 use warnings;
155 use Term::ReadLine;
156 use Bio::DB::HIV;
157 use Bio::DB::Query::HIVQuery;
158 use Bio::SeqIO;
159 use Error qw(:try);
161 use Getopt::Std;
162 our ($opt_v);
163 getopts('v');
164 my $db = new Bio::DB::HIV;
166 my $q = new Bio::DB::Query::HIVQuery();
167 my $schema = $q->_schema;
168 my $seqio;
169 my @cmds = qw(
171 config
172 confirm
173 count
175 exit
176 find
177 help
180 outfile
181 ping
182 prerun
183 query
184 quit
186 state
189 my %state = (
190 'confirm' => 0,
191 'outfile' => {'text'=>'[stdout]','handle'=>\*STDOUT},
192 'query' => '',
193 'session_id' => '',
194 'timeout' => '',
195 'curct' => 0
198 my %help = (
199 'bye' => 'bye: End script',
200 'config' => 'config: Print current configuration state of hivq',
201 'confirm' => 'confirm: Toggle user confirmation of query execution',
202 'count' => 'count [[query string]]: Get number of sequences available for query',
203 'do' => 'do [[query string]]: Run query to completion',
204 'exit' => 'exit: End script',
205 'find' => join("\n","find tables: Print all LANL table names",
206 "find fields: Print all field names",
207 "find fields [tablename]: Print all fields in specified table",
208 "find aliases [fieldname]: Print all valid aliases for specified field",
209 "find options [fieldname/alias]: Print valid match options for specified field"),
210 'help' => 'help [[command]]: Get description of command',
211 'id' => 'id: Print current session id',
212 'out' => 'out [output filename]: Set filename to hold returned sequences',
213 'outfile' => 'outfile [output filename]: Set filename to hold returned sequences',
214 'ping' => 'ping: Ping LANL database',
215 'prerun' => 'prerun [[query string]]: Get number of sequences available',
216 'query' => 'query [[query string]]: Validate and set current query string',
217 'quit' => 'quit: End script',
218 'run' => 'run [[query string]]: Run query to completion',
219 'state' => 'state: Print current configuration state of hivq',
221 my $done = 0;
222 debug("We have arrived.");
223 #### main loop
224 my ($cmd, @tok);
225 my $prompt = "hivq> ";
226 my $term = new Term::ReadLine 'hivq';
227 CMD:
228 while ( !$done ) {
229 @tok=();$cmd='';
230 for ( $cmd = $term->readline($prompt) ) {
231 chomp;
232 s/^\s+//; #trim whsp
233 $term->addhistory($_);
234 @tok = split(/\s+/);
235 $tok[0] = lc $tok[0]; # normalize
236 for ($tok[0]) {
237 (!$_ || /^\#/) && do {
238 next CMD;
240 !(grep(/^$tok[0]$/, @cmds)) && do {
241 error("Unrecognized command \'$tok[0]\'");
242 next CMD;
244 (m/^quit$/ || m/^bye$/ || m/^exit$/) && do {
245 my $fh = $state{outfile}->{handle};
246 unless ($fh == \*STDOUT) {
247 close $fh;
249 $done=1;
250 next CMD;
252 m/^confirm$/ && do {
253 $state{confirm} = !$state{confirm};
254 print "User confirmation before running query: ". ($state{confirm} ? "yes" : "no")."\n";
255 next CMD;
257 (m/^outfile$/ || m/^out$/) && do {
258 unless ($tok[1]) {
259 error('Output filename not specified');
260 next CMD;
262 my $fh;
263 eval {
264 open $fh, ">$tok[1]" or die $!;
266 if ($@) {
267 error($@);
269 else {
270 my $oldfh = $state{outfile}->{handle};
271 if ($state{outfile}->{text} ne '[stdout]') {
272 close($oldfh);
274 $state{outfile}->{handle} = $fh;
275 $state{outfile}->{text} = $tok[1];
277 next CMD;
279 m/^query$/ && do {
280 my $query = join(' ', @tok[1..$#tok]);
281 if ($query) {
282 $q->_reset;
283 $q->query($query);
285 else {
286 if ($state{query}) {
287 print "Current query: $state{query}\n";
289 else {
290 print "No query set\n";
292 next CMD;
294 try { #catch errors
295 $q->_do_query(0);
297 catch Error with {
298 my $E = shift;
299 error($E->text);
300 next CMD;
302 $state{session_id} = $q->_session_id;
303 $state{query} = $query;
304 next CMD;
306 (m/^prerun$/ || m/^count$/) && do {
307 my $query = join(' ', @tok[1..$#tok]);
308 if (!$query) {
309 if (!$q->query()) {
310 error('No query currently set');
311 next CMD;
313 elsif ($q->{'_RUN_LEVEL'} >= 1 &&
314 ($q->query() eq $state{query})) {
315 outputPrerun();
316 next CMD;
318 # else, fallthrough to prerun current query
320 else { # new query
321 $q->_reset;
322 $q->query($query);
323 try {
324 $q->_do_query(0);
326 catch Error with {
327 my $E = shift;
328 error($E->text);
329 next CMD;
332 # on success:
333 $state{confirm} && do { next CMD unless getConfirm(); };
334 $state{session_id} = $q->_session_id;
335 $state{query} = $query;
336 try {
337 $q->_do_query(1);
339 catch Error with {
340 my $E = shift;
341 error($E->text);
342 next CMD;
344 # on success:
345 $state{curct} = $q->count;
346 $state{query} = $q->query;
347 outputPrerun();
348 next CMD;
351 (m/^run$/ || m/^do$/) && do {
352 my $query = join(' ', @tok[1..$#tok]);
353 if (!$query) {
354 unless (defined $q->{'_RUN_LEVEL'} && $q->{'_RUN_LEVEL'} < 2) {
355 error('No query currently set');
356 next CMD;
359 else { # new query
360 $q->_reset;
361 $q->query($query);
362 try {
363 $q->_do_query(0);
365 catch Error with {
366 my $E = shift;
367 error($E->text);
368 next CMD;
370 $state{query} = $query;
371 $state{curct} = 0;
372 $state{session_id} = $q->_session_id;
374 # on success:
375 $state{confirm} && do { next CMD unless getConfirm(); };
376 try {
377 $q->_do_query(2);
378 $seqio = $db->get_Stream_by_query($q);
380 catch Error with {
381 my $E = shift;
382 error($E->text);
383 next CMD;
385 # on success:
386 $state{curct} = $q->count;
387 # output the seqs
388 if ($q->count) {
389 outputSeqs($seqio);
391 else {
392 error('No sequences returned');
394 next CMD;
396 m/^find$/ && do {
397 outputFind(@tok);
398 next CMD;
400 m/^help$/ && do {
401 unless ($tok[1]) {
402 outputUsage();
403 next CMD;
405 if (grep(/$tok[1]/, @cmds)) {
406 print $help{$tok[1]}, "\n";
408 else {
409 error("Command \'$tok[1]\' unrecognized\n");
411 next CMD;
413 (m/^state$/ || m/^config$/) && do {
414 foreach my $k (sort keys %state) {
415 if (ref($state{$k}) eq 'HASH') {
416 print "$k: ".$state{$k}->{text}."\n";
418 else {
419 print "$k: ";
420 if ($k eq 'confirm') {
421 print $state{$k} ? "yes" : "no";
422 print "\n";
424 else {
425 print "$state{$k}\n";
429 next CMD;
431 do {
432 error("Command currently unimplemented\n");
433 next CMD;
439 ### end main
441 sub error {
442 my $msg = shift;
443 if ($msg =~ /MSG:/) {
444 ($msg) = grep (/^MSG:/, split(/\n|\r/,$msg));
445 $msg =~ s/^MSG: *//;
446 $msg =~ s/\sat\s.*$//;
448 print STDERR "hivq: $msg\n";
449 return 1;
452 sub outputPrerun {
453 print (($state{curct} ? $state{curct} : "No")
454 . " sequence"
455 . ($state{curct}>1 ? "s" : "")
456 . " returned\n");
457 print "Query: ".$state{query}."\n";
458 return 1;
461 sub outputSeqs {
462 my ($seqio) = @_;
463 my @ret;
464 while (my $seq = $seqio->next_seq) {
465 my $nameline = ">";
466 $nameline .= $seq->annotation->get_value('Special', 'accession').
467 '('.$seq->id.') ';
468 # loop through categories:
469 foreach my $cat ($seq->annotation->get_keys()) {
470 foreach my $an ($seq->annotation->get_keys($cat)) {
471 next if ($an eq 'accession');
472 my $value = $seq->annotation->get_value($cat, $an);
473 # next line: kludge to skip if there's an annotation
474 # object instead of a value (I believe this is a bug)
475 next if ref($value);
476 $nameline .= "\t".join('=', "'$an'", "'".$value."'");
479 push @ret, $nameline."\n";
480 push @ret, $seq->seq()."\n";
482 doOutputSeqs(@ret);
485 sub doOutputSeqs {
486 my @lines = @_;
487 if (!@lines) {
488 error("No sequences to output");
490 my $fh = $state{outfile}->{handle};
491 print $fh @lines;
492 print "Download complete\n";
493 return 1;
496 sub getConfirm {
497 print "Run query? [y/n]";
498 my $ans = <STDIN>;
499 ($ans =~ /^[yY]/) && do {return 1;};
500 return 0;
503 sub outputUsage {
504 print "Available commands:\n";
505 outputInColumns(\@cmds);
506 return 1;
509 sub outputFind {
510 my @tok = @_;
511 for my $arg ($tok[1]) {
512 !$arg && do {
513 print $help{find},"\n";
514 return;
516 ($arg =~ m/^tables$/) && do {
517 outputInColumns([$schema->tables]);
518 return;
520 ($arg =~ m/^fields$/) && do {
521 my $tbl = $tok[2];
522 if (!$tbl) {
523 outputInColumns([$schema->fields]);
525 else {
526 unless (grep /^$tbl$/, $schema->tables) {
527 error("Table \'$tbl\' not valid");
528 return;
530 outputInColumns([grep /^$tbl/, $schema->fields()]);
532 return;
534 ($arg =~ /^options$/) && do {
535 my $fld = $tok[2];
536 my @aliases = $schema->aliases($fld);
537 if (!@aliases) {
538 unless (grep /^$fld$/, $schema->aliases) {
539 error("Field \'$tok[2]\' not valid");
540 return;
542 foreach ($schema->fields) {
543 if ( grep /$fld/, $schema->aliases($_) ) {
544 $fld = $_;
545 last;
548 # on success: $fld is set to valid field
550 my @options = sort {$a cmp $b} $schema->options($fld);
551 @options = (@options ? @options : ('Free text'));
552 outputInColumns(\@options);
553 return;
555 do {
556 ($arg =~ /^alias/) && ($arg = $tok[2]);
557 if (grep /$arg/, $schema->fields) {
558 my @aliases = sort $schema->aliases($arg);
559 if (@aliases) {
560 outputInColumns(\@aliases);
562 else {
563 error("No aliases to field \'$arg\'");
566 else {
567 error("Field \'$arg\' not valid");
569 return;
577 sub outputInColumns {
578 my ($items, $n, $w) = @_;
579 my $i = 0;
580 my @items = @$items;
581 my $maxl = length([sort {length($b)<=>length($a)} @items]->[0]);
582 $n ||= int(60/($maxl+3)) || 1;
583 $w ||= $maxl+3;
584 my $coll = int(@items/$n);
585 $coll == @items/$n ? $coll : ++$coll;
586 my @t;
587 for my $j (0..$n-1) {
588 if ($j<$n-1) {
589 $t[$j] = [@items[$j*$coll..$j*$coll+$coll-1]];
591 else {
592 $t[$j] = [@items[$j*$coll..$#items]];
595 @items = map { my $j = $_; map { $t[$_]->[$j] || () } (0..$#t) } (0..scalar(@{$t[0]})-1);
596 foreach (@items) {
597 $_ .= (' ' x ($w-length($_)));
599 while ($i < @items) {
600 print join('', map { $_ || () } @items[$i..$i+$n-1] ),"\n";
601 $i+=$n;
603 return 1;
606 sub debug {
607 print STDERR shift()."\n" if $opt_v;
608 return;