1 # to do: support for comment, reference annotations
3 # $Id: HIVQuery.pm 232 2008-12-11 14:51:51Z maj $
5 # BioPerl module for Bio::DB::Query::LANLQuery
7 # Cared for by Mark A. Jensen <maj@fortinbras.us>
9 # Copyright Mark A. Jensen
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::DB::Query::HIVQuery - Query interface to the Los Alamos HIV Sequence Database
21 $q = new Bio::DB::Query::HIVQuery(" C[subtype] ZA[country] CXCR4[coreceptor] ");
22 $q = new Bio::DB::Query::HIVQuery(
23 -query=>{'subtype'=>'C',
25 'coreceptor'=>'CXCR4'});
27 $ac = $q->get_annotations_by_id(($q->ids)[0]);
28 $ac->get_value('Geo', 'country') # returns 'SOUTH AFRICA'
30 $db = new Bio::DB::HIV();
31 $seqio = $db->get_Stream_by_query($q); # returns annotated Bio::Seqs
33 # get subtype C sequences from South Africa and Brazil,
34 # with associated info on patient health, coreceptor use, and
37 $q = new Bio::DB::Query::HIVQuery(
39 'query' => {'subtype'=>'C',
40 'country'=>['ZA', 'BR']},
41 'annot' => ['patient_health',
43 'days_post_infection']
49 Bio::DB::Query::HIVQuery provides a query-like interface to the
50 cgi-based Los Alamos National Laboratory (LANL) HIV Sequence
51 Database. It uses Bioperl facilities to capture both sequences and
52 annotations in batch in an automated and computable way. Use with
53 L<Bio::DB::HIV> to create C<Bio::Seq> objects and annotated C<Bio::SeqIO>
58 The interface implements a simple query language emulation that understands AND,
59 OR, and parenthetical nesting. The basic query unit is
61 (match1 match2 ...)[fieldname]
63 Sequences are returned for which C<fieldname> equals C<match1 OR match2 OR ...>.
64 These units can be combined with AND, OR and parentheses. For example:
66 (B, C)[subtype] AND (2000, 2001, 2002, 2003)[year] AND ((CN)[country] OR (ZA)[country])
68 which can be shortened to
70 (B C)[subtype] (2000 2001 2002 2003)[year] (CN ZA)[country]
72 The user can specify annotation fields, that do not restrict the query, but
73 arrange for the return of the associated field data for each sequence returned.
74 Specify annotation fields between curly braces, as in:
76 (B C)[subtype] 2000[year] {country cd4_count cd8_count}
78 Annotations can be accessed off the query using methods described in APPENDIX.
80 =head2 Hash specifications for query construction
82 Single query specifications can be made as hash references provided to the
83 C<-query> argument of the constructor. There are two forms:
85 -query => { 'country'=>'BR', 'phenotype'=>'NSI', 'cd4_count'=>'Any' }
89 -query => [ 'country'=>'BR', 'phenotype'=>'NSI', 'cd4_count'=>'Any' ]
93 -query => { 'query' => {'country'=>'BR', 'phenotype'=>'NSI'},
94 'annot' => ['cd4_count'] }
96 In both cases, the CD4 count is included in the annotations returned, but does
97 not restrict the rest of the query.
99 To 'OR' multiple values of a field, use an anonymous array ref:
101 -query => { 'country'=>['ZA','BR','NL'], 'subtype'=>['A', 'C', 'D'] }
103 =head2 Valid query field names
105 An attempt was made to make the query field names natural and easy to
106 remember. Aliases are specified in an XML file (C<lanl-schema.xml>) that is part
107 of the distribution. Custom field aliases can be set up by modifying this file.
109 An HTML cheatsheet with valid field names, aliases, and match data can
110 be generated from the XML by using
111 C<hiv_object-E<gt>help('help.html')>. A query can also be validated
112 locally before it is unleashed on the server; see below.
116 LANL DB annotations have been organized into a number of natural
117 groupings, tagged C<Geo>, C<Patient>, C<Virus>, and <StdMap>. After a
118 successful query, each id is associated with a tree of
119 L<Bio::Annotation::SimpleValue> objects. These can be accessed with
120 methods C<get_value()> and C<put_value()> described in APPENDIX.
122 =head2 Delayed/partial query runs
124 Accessing the LANL DB involves multiple HTTP requests. The query can
125 be instructed to proceed through all (the default) or only some of
126 them, using the named parameter C<RUN_OPTION>.
128 To validate a query locally, use
130 $q = new Bio::DB::Query::HIVQuery( -query => {...}, -RUN_OPTION=>0 )
132 which will throw an exception if a field name or option is invalid.
134 To get a query count only, you can save a server hit by using
136 $q = new Bio::DB::Query::HIVQuery( -query => {...}, -RUN_OPTION=>1 )
138 and asking for C<$q-E<gt>count>. To finish the query, do
142 which picks up where you left off.
144 C<-RUN_OPTION=E<gt>2>, the default, runs the full query, returning ids and
149 You can clear the query results, retaining the same LANL session and query spec,
150 by doing C<$q-E<gt>_reset>. Change the query, and rerun with
151 C<$q-E<gt>_do_query($YOUR_RUN_OPTION)>.
157 User feedback is an integral part of the evolution of this and other
158 Bioperl modules. Send your comments and suggestions preferably to
159 the Bioperl mailing list. Your participation is much appreciated.
161 bioperl-l@bioperl.org - General discussion
162 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
164 =head2 Reporting Bugs
166 Report bugs to the Bioperl bug tracking system to help us keep track
167 of the bugs and their resolution. Bug reports can be submitted via
170 http://bugzilla.open-bio.org/
172 =head1 AUTHOR - Mark A. Jensen
174 Email maj@fortinbras.us
180 The rest of the documentation details each of the object methods.
181 Internal methods are usually preceded with a _
185 # Let the code begin...
187 package Bio
::DB
::Query
::HIVQuery
;
189 use vars
qw( $LANL_BASE $LANL_MAP_DB $LANL_MAKE_SEARCH_IF $LANL_SEARCH $SCHEMA_FILE $RUN_OPTION );
191 # Object preamble - inherits from Bio::DB::QueryI
193 use Bio::Annotation::Collection;
194 use Bio::Annotation::Comment;
195 use Bio::Annotation::Reference;
200 use Bio::DB::HIV::HIVQueryHelper;
202 use base qw(Bio::Root::Root Bio::DB::QueryI);
206 $LANL_BASE = "http://www.hiv.lanl.gov/components/sequence/HIV/advanced_search";
207 $LANL_MAP_DB = "map_db.comp";
208 $LANL_MAKE_SEARCH_IF = "make_search_if.comp";
209 $LANL_SEARCH = "search.comp";
210 $SCHEMA_FILE = Bio
::Root
::IO
->catfile(qw(Bio DB HIV lanl-schema.xml));
211 $RUN_OPTION = 2; # execute query
213 @Bio::SchemaNotInit
::Exception
::ISA
= qw( Bio::Root::Exception );
214 @Bio::WebError
::Exception
::ISA
= qw( Bio::Root::Exception );
215 @Bio::QueryNotMade
::Exception
::ISA
= qw( Bio::Root::Exception );
216 @Bio::QueryStringException
::Exception
::ISA
= qw( Bio::Root::Exception );
217 @Bio::HIVSorry
::Exception
::ISA
= qw
( Bio
::Root
::Exception
);
226 Usage : my $hiv_query = new Bio::DB::Query::HIVQuery();
227 Function: Builds a new Bio::DB::Query::HIVQuery object,
228 running a sequence query against the Los Alamos
229 HIV sequence database
230 Returns : an instance of Bio::DB::Query::HIVQuery
236 my($class,@args) = @_;
237 my $self = $class->SUPER::new
(@args);
238 my ($query, $ids, $lanl_base, $lanl_map_db, $lanl_make_search_if, $lanl_search, $schema_file,$run_option) =
239 $self->_rearrange([ qw(QUERY
250 $lanl_base||= $LANL_BASE;
251 $lanl_map_db||=$LANL_MAP_DB;
252 $lanl_make_search_if||=$LANL_MAKE_SEARCH_IF;
253 $lanl_search||=$LANL_SEARCH;
254 $schema_file||=$SCHEMA_FILE;
255 defined $run_option || ($run_option = $RUN_OPTION);
257 $self->lanl_base($lanl_base);
258 $self->map_db($lanl_map_db);
259 $self->make_search_if($lanl_make_search_if);
260 $self->search_($lanl_search);
261 $self->_run_option($run_option);
263 # catch this at the top
264 if (-e
$schema_file) {
265 $self->_schema_file($schema_file);
268 my ($p) = $self->_schema_file( [grep {$_} map {
269 my $p = Bio
::Root
::IO
->catfile($_, $schema_file);
272 $self->throw(-class=>"Bio::Root::NoSuchThing",
273 -text
=>"Schema file \"".$self->_schema_file."\" cannot be found",
274 -value
=>$self->_schema_file) unless -e
$self->_schema_file;
278 $self->{_schema
} = HIVSchema
->new($self->_schema_file);
280 # internal storage and flags
281 $self->{'_lanl_query'} = [];
282 $self->{'_lanl_response'} = [];
283 $self->{'_annotations'} = {}; # container for annotation collections assoc. with ids
284 $self->{'_RUN_LEVEL'} = undef; # set in _do_query()
287 defined $query && $self->query($query);
288 defined $ids && $self->ids($ids);
292 $self->_do_query($self->_run_option) if $self->query;
297 =head1 QueryI compliance
302 Usage : $hiv_query->count($newval)
303 Function: return number of sequences found
305 Returns : value of count (a scalar)
306 Args : on set, new value (a scalar or undef, optional)
307 Note : count warns if it is accessed for reading before query
308 has been executed to at least level 1
314 return $self->{'count'} = shift if @_;
315 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 1)) {
316 $self->warn('Query not yet run at > level 1');
318 return $self->{'count'};
324 Usage : $hiv_query->ids($newval)
325 Function: LANL ids of returned sequences
327 Returns : value of ids (an arrayref of sequence accessions/ids)
328 Args : on set, new value (an arrayref or undef, optional)
336 $self->throw(-class=>'Bio::Root::BadParameter',
337 -text
=>'Arrayref required',
338 -value
=> ref $a) unless ref($a) eq 'ARRAY';
339 @
{$self->{'ids'}}{@
$a} = (1) x @
$a;
342 return keys %{$self->{'ids'}} if $self->{'ids'};
348 Usage : $hiv_query->query
349 Function: Get/set the submitted query hash or string
351 Returns : hashref or string
352 Args : query in hash or string form (see DESCRIPTION)
358 return $self->{'query'} = shift if @_;
359 return $self->{'query'};
362 =head1 Bio::DB::Query::HIVQuery specific methods
367 Usage : $hiv_query->help("help.html")
368 Function: get html-formatted listing of valid fields/aliases/options
369 based on current schema xml
370 Example : perl -MBio::DB::Query::HIVQuery -e "new Bio::DB::Query::HIVQuery()->help" | lynx -stdin
372 Args : optional filename; otherwise prints to stdout
377 my ($self, $fname) = @_;
379 my $schema = $self->_schema;
382 my (@tbls, @flds, @als, @opts, $fh);
384 open ($fh, ">", $fname) or $self->throw(-class=>'Bio::Root::IOException',
385 -text
=>"Error opening help html file $fname for writing",
391 @tbls = $schema->tables;
393 $h->start_html(-title
=>"HIVQuery Help")
395 print $fh $h->a({-id
=>'TOP'}, $h->h2("Valid <span style='font-variant:small-caps'>Bio::DB::Query::HIVQuery</span> query fields and match data"));
396 print $fh "Fields are organized below according to their Los Alamos HIV database tables. Use aliases in place of full field names in queries; for example:<br/>";
397 print $fh "<blockquote><code> \$hiv_query->query( '(CCR5 CXCR4)[coreceptor] ' ); </code></blockquote><br/>";
398 print $fh "rather than <br/>";
399 print $fh "<blockquote><code> \$hiv_query->query( '(CCR5 CXCR4)[SEQ_SAMple.SSAM_second_receptor]' );</code></blockquote><br/>";
400 print $fh "(which does work, however). Click hyperlinks to see valid search options within the field. The token <code><b>Any</b></code> is the wildcard for all fields.<br/><br/>";
401 print $fh $h->start_table({-style
=>"font-family:sans-serif;"}) ;
402 foreach my $tbl (@tbls) {
403 @flds = grep /^$tbl/, $schema->fields;
404 @flds = grep !/_id/, @flds;
406 $h->start_Tr({-style
=>"background-color: lightblue;"}),
407 $h->td([$h->a({-id
=>$tbl},$tbl), $h->span({-style
=>"font-style:italic"},"field aliases")]),
410 foreach my $fld (@flds) {
411 @als = reverse $schema->aliases($fld);
413 $h->Tr( $h->td( ["", $h->a({-href
=>"#opt$fld"}, shift @als)] ))
415 my @tmp = grep {$_} $schema->options($fld);
416 #print STDERR join(', ',@tmp)."\n";
418 {-style
=>"font-family:sans-serif;font-size:small"},
422 "<i>Valid options for</i> <b>$fld</b>: "),
424 @tmp ?
$h->code(join(", ", @tmp)) : $h->i("free text")
427 "<i>Other aliases</i>: "),
429 @als ?
$h->code(join(",",@als)) : "<i>none</i>"
434 $h->a({-href
=>"#$tbl"}, $h->small('BACK')),
435 $h->a({-href
=>"#TOP"}, $h->small('TOP'))
441 print $fh $h->end_table;
443 print $fh $h->end_html;
448 =head1 Annotation manipulation methods
450 =head2 get_annotations_by_ids
452 Title : get_annotations_by_ids (or ..._by_id)
453 Usage : $ac = $hiv_query->get_annotations_by_ids(@ids)
454 Function: Get the Bio::Annotation::Collection for these sequence ids
456 Returns : A Bio::Annotation::Collection object
457 Args : an array of sequence ids
461 sub get_annotations_by_ids
{
465 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 2)) {
466 $self->warn('Requires query run at level 2');
469 @ret = map {$self->{'_annotations'}->{$_}} @ids if exists($self->{'_annotations'});
471 return (wantarray ?
@ret : $ret[0]) if @ret;
476 sub get_annotations_by_id
{
477 shift->get_annotations_by_ids(@_);
480 =head2 add_annotations_for_id
482 Title : add_annotations_for_id
483 Usage : $hiv_query->add_annotations_for_id( $id ) to create a new
484 empty collection for $id
485 $hiv_query->add_annotations_for_id( $id, $ac ) to associate
487 Function: Associate a Bio::Annotation::Collection with this sequence id
489 Returns : a Bio::Annotation::Collection object
490 Args : sequence id [, Bio::Annotation::Collection object]
494 sub add_annotations_for_id
{
497 $ac = new Bio
::Annotation
::Collection
unless defined $ac;
498 $self->throw(-class=>'Bio::Root::BadParameter'
499 -text
=>'Bio::Annotation::Collection required at arg 2',
500 -value
=>"") unless ref($ac) eq 'Bio::Annotation::Collection';
502 $self->{'_annotations'}->{$id} = $ac unless exists($self->{'_annotations'}->{$id});
506 =head2 remove_annotations_for_ids
508 Title : remove_annotations_for_ids (or ..._for_id)
509 Usage : $hiv_query->remove_annotations_for_ids( @ids)
510 Function: Remove annotation collection for this sequence id
512 Returns : An array of the previous annotation collections for these ids
513 Args : an array of sequence ids
517 sub remove_annotations_for_ids
{
522 push @ac, delete $self->{'_annotations'}->{$_};
528 sub remove_annotations_for_id
{
529 shift->remove_annotations_for_ids(@_);
532 =head2 remove_annotations
534 Title : remove_annotations
535 Usage : $hiv_query->remove_annotations()
536 Function: Remove all annotation collections for this object
538 Returns : The previous annotation collection hash for this object
543 sub remove_annotations
{
546 my $ach = $self->{'_annotations'};
547 $self->{'_annotations'} = {};
554 Usage : $ac->get_value($tagname) -or-
555 $ac->get_value( $tag_level1, $tag_level2,... )
556 Function: access the annotation value assocated with the given tags
559 Args : an array of tagnames that descend into the annotation tree
560 Note : this is a L<Bio::AnnotationCollectionI> method added in
561 L<Bio::DB::HIV::HIVQueryHelper>
568 Usage : $ac->put_value($tagname, $value) -or-
569 $ac->put_value([$tag_level1, $tag_level2, ...], $value) -or-
570 $ac->put_value( [$tag_level1, $tag_level2, ...] )
571 Function: create a node in an annotation tree, and assign a scalar value to it
572 if a value is specified
574 Returns : scalar or a Bio::AnnotationCollection object
575 Args : $tagname, $value scalars (can be specified as -KEYS=>$tagname,
577 \@tagnames, $value (or as -KEYS=>\@tagnames, -VALUE=>$value )
578 Notes : This is a L<Bio::AnnotationCollectionI> method added in
579 L<Bio::DB::HIV::HIVQueryHelper>.
580 If intervening nodes do not exist, put_value creates them, replacing
581 existing nodes. So if $ac->put_value('x', 10) was done, then later,
582 $ac->put_value(['x', 'y'], 20), the original value of 'x' is trashed,
583 and $ac->get_value('x') will now return the annotation collection
588 =head1 GenBank accession manipulation methods
590 =head2 get_accessions
592 Title : get_accessions
593 Usage : $hiv_query->get_accessions()
594 Function: Return an array of GenBank accessions associated with these
595 sequences (available only after a query is subjected to a
596 full run (i.e., when $RUN_OPTION == 2)
598 Returns : array of gb accession numbers, or () if none found for this query
606 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 2)) {
607 $self->warn('Requires query run at level 2');
610 my @ac = $self->get_annotations_by_ids($self->ids);
612 push @ret, $_->get_value('accession');
617 =head2 get_accessions_by_ids
619 Title : get_accessions_by_ids (or ..._by_id)
620 Usage : $hiv_query->get_accessions_by_ids(@ids)
621 Function: Return an array of GenBank accessions associated with these
622 LANL ids (available only after a query is subjected to a
623 full run (i.e., when $RUN_OPTION == 2)
625 Returns : array of gb accession numbers, or () if none found for this query
630 sub get_accessions_by_ids
{
634 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 2)) {
635 $self->warn('Requires query run at level 2');
638 my @ac = $self->get_annotations_by_ids(@ids);
640 push @ret, $_->get_value('accession');
642 return wantarray ?
@ret : $ret[0];
646 sub get_accessions_by_id
{
647 shift->get_accessions_by_ids(@_);
652 =head1 Query control methods
657 Usage : $hiv_query->_do_query or $hiv_query->_do_query($run_level)
658 Function: Execute the query according to argument or $RUN_OPTION
660 extent of query reflects the value of argument
661 0 : validate only (no HTTP action)
662 1 : return sequence count only
663 2 : return sequence ids (full query, returns with annotations)
664 noop if current _RUN_LEVEL of query is >= argument or $RUN_OPTION,
666 Returns : actual _RUN_LEVEL (0, 1, or 2) achieved
667 Args : desired run level (optional, global $RUN_OPTION is default)
673 $rl = $RUN_OPTION unless defined $rl;
674 $self->throw(-class=>"Bio::Root::BadParameter",
675 -text
=>"Invalid run option \"$RUN_OPTION\"",
676 -value
=>$RUN_OPTION) unless grep /^$RUN_OPTION$/, (0, 1, 2);
677 (!defined($self->{'_RUN_LEVEL'})) && do {
678 $self->_create_lanl_query();
679 $self->{'_RUN_LEVEL'} = 0;
681 ($rl > 0) && (!defined($self->{'_RUN_LEVEL'}) || ($self->{'_RUN_LEVEL'} <= 0)) && do {
682 $self->_do_lanl_request();
683 $self->{'_RUN_LEVEL'} = 1;
685 ($rl > 1) && (!defined($self->{'_RUN_LEVEL'}) || ($self->{'_RUN_LEVEL'} <= 1)) && do {
686 $self->_parse_lanl_response();
687 $self->{'_RUN_LEVEL'} = 2;
689 return $self->{'_RUN_LEVEL'};
695 Usage : $hiv_query->_reset
696 Function: Resets query storage, count, and ids, while retaining session id,
697 original query string, and db schema
708 $self->{'_annotations'} = {};
709 $self->{'_lanl_response'} = [];
710 $self->{'_lanl_query'} = [];
711 $self->{'_RUN_LEVEL'} = undef;
718 Usage : $hiv_query->_session_id($newval)
719 Function: Get/set HIV db session id (initialized in _do_lanl_request)
721 Returns : value of _session_id (a scalar)
722 Args : on set, new value (a scalar or undef, optional)
729 return $self->{'_session_id'} = shift if @_;
730 return $self->{'_session_id'};
736 Usage : $hiv_query->_run_option($newval)
737 Function: Get/set HIV db query run option (see _do_query for values)
739 Returns : value of _run_option (a scalar)
740 Args : on set, new value (a scalar or undef, optional)
747 return $self->{'_run_option'} = shift if @_;
748 return $self->{'_run_option'};
758 Usage : $hiv_query->add_id($id)
759 Function: Add new id to ids
769 ${$self->{'ids'}}{$id}++;
776 return $self->{'lanl_base'} = shift if @_;
777 return $self->{'lanl_base'};
783 Usage : $obj->map_db($newval)
786 Returns : value of map_db (a scalar)
787 Args : on set, new value (a scalar or undef, optional)
793 return $self->{'map_db'} = shift if @_;
794 return $self->{'map_db'};
797 =head2 make_search_if
799 Title : make_search_if
800 Usage : $obj->make_search_if($newval)
803 Returns : value of make_search_if (a scalar)
804 Args : on set, new value (a scalar or undef, optional)
810 return $self->{'make_search_if'} = shift if @_;
811 return $self->{'make_search_if'};
817 Usage : $obj->search_($newval)
820 Returns : value of search_ (a scalar)
821 Args : on set, new value (a scalar or undef, optional)
827 return $self->{'search_'} = shift if @_;
828 return $self->{'search_'};
835 Function: return the full map_db uri ("Database Map")
837 Returns : scalar string
844 return $self->lanl_base."/".$self->map_db;
848 =head2 _make_search_if_uri
850 Title : _make_search_if_uri
852 Function: return the full make_search_if uri ("Make Search Interface")
854 Returns : scalar string
859 sub _make_search_if_uri
{
861 return $self->lanl_base."/".$self->make_search_if;
868 Function: return the full search cgi uri ("Search Database")
870 Returns : scalar string
877 return $self->lanl_base."/".$self->search_;
883 Usage : $hiv_query->_schema_file($newval)
886 Returns : value of _schema_file (an XML string or filename)
887 Args : on set, new value (an XML string or filename, or undef, optional)
894 return $self->{'_schema_file'} = shift if @_;
895 return $self->{'_schema_file'};
901 Usage : $hiv_query->_schema($newVal)
904 Returns : value of _schema (an HIVSchema object in package
905 L<Bio::DB::HIV::HIVQueryHelper>)
906 Args : none (field set directly in new())
914 return $self->{'_schema'} :
915 $self->throw(-class=>'Bio::SchemaNotInit::Exception',
916 -text
=>"DB schema not initialized",
924 Usage : $hiv_query->_lanl_query(\@query_parms)
925 Function: pushes \@query_parms onto @{$self->{'_lanl_query'}
927 Returns : value of _lanl_query (an arrayref)
928 Args : on set, new value (an arrayref or undef, optional)
935 return $self->{'_lanl_query'} unless $a;
936 if (ref $a eq 'ARRAY') {
937 push @
{$self->{'_lanl_query'}}, $a;
941 $self->throw(-class=>'Bio::Root::BadParameter',
942 -text
=>'Array ref required for argument.',
948 =head2 _lanl_response
950 Title : _lanl_response
951 Usage : $hiv_query->_lanl_response($response)
952 Function: pushes $response onto @{$hiv_query->{'_lanl_response'}}
954 Returns : value of _lanl_response (an arrayref of HTTP::Response objects)
955 Args : on set, new value (an HTTP::Response object or undef, optional)
963 $self->throw(-class=>'Bio::Root::BadParameter',
964 -text
=>'Requires an HTTP::Response object',
965 -value
=> ref $r) unless ref($r) eq 'HTTP::Response';
966 push @
{$self->{'_lanl_response'}}, $r;
969 return $self->{'_lanl_response'};
972 =head2 _create_lanl_query
974 Title : _create_lanl_query
975 Usage : $hiv_query->_create_lanl_query()
976 Function: validate query hash or string, prepare for _do_lanl_request
978 Returns : 1 if successful; throws exception on invalid query
983 sub _create_lanl_query
{
985 my (%inhash, @query, @qhashes);
986 my ($schema, @validFields, @validAliases);
990 $self->throw(-class=>'Bio::Root::NoSuchThing',
991 -text
=>'Query not specified',
995 ref eq 'HASH' && do {
997 if ( grep /HASH/, map {ref} values %inhash ) {
998 # check for {query=>{},annot=>[]} style
999 $self->throw(-class=>'Bio::Root::BadParameter',
1000 -text
=>'Query style unrecognized',
1001 -value
=>"") unless defined $inhash{query
};
1006 ref eq 'ARRAY' && do {
1007 $inhash{'query'} = {@
$_};
1008 push @qhashes, \
%inhash;
1013 @qhashes = $self->_parse_query_string($_);
1016 $schema = $self->_schema;
1017 @validFields = $schema->fields;
1018 @validAliases = $schema->aliases;
1020 # validate args based on the xml specification file
1021 # only checks blanks and fields with explicitly specified options
1022 # text fields can put anything, and the query will be run before
1023 # an error is caught in these
1024 foreach my $qh (@qhashes) {
1025 foreach my $k (keys %{$$qh{'query'}}) {
1028 if (grep /^$k$/, @validFields) {
1031 elsif (grep /^$k$/, @validAliases) {
1032 foreach (@validFields) {
1033 if (grep (/^$k$/, $schema->aliases($_))) {
1037 # $fld contains the field corresp. to the alias
1041 $self->throw(-class=>'Bio::Root::BadParameter',
1042 -text
=>"Invalid field or alias \"$k\"",
1045 # validate matchdata
1046 my $vf = $schema->_sfieldh($fld);
1047 my @md = (ref($qh->{'query'}{$k}) eq 'ARRAY') ? @
{$qh->{'query'}{$k}} : $qh->{'query'}{$k};
1048 if ($$vf{type
} eq 'text') {
1050 $self->throw(-class=>'Bio::Root::BadParameter',
1051 -text
=>'Value for field \"$k\" cannot be empty',
1053 if ($_ eq "") && ($$vf{blank_ok
} eq 'false');
1056 elsif ($$vf{type
} eq 'option') {
1057 foreach my $md (@md) {
1058 $self->throw(-class=>'Bio::Root::BadParameter',
1059 -text
=>"Invalid value \"".$md."\" for field \"$fld\"",
1061 unless $$vf{option
} && grep {defined $_ && /^$md$/} @
{$$vf{option
}};
1064 # validated; add to query
1066 push @query, ($fld => $_);
1069 if ($qh->{'annot'}) {
1070 # validate the column names to be included in the query
1071 # to obtain annotations
1072 my @annot_cols = @
{$qh->{'annot'}};
1073 foreach my $k (@annot_cols) {
1076 if (grep /^$k$/, @validFields) {
1079 elsif (grep /^$k$/, @validAliases) {
1080 foreach (@validFields) {
1081 if (grep (/^$k$/, $schema->aliases($_))) {
1085 # $fld should contain the field corresp. to the alias
1089 $self->throw(-class=>'Bio::Root::NoSuchThing',
1090 -text
=>"Invalid field or alias \"$k\"",
1093 # lazy: 'Any' may not be the right default (but appears to
1094 # be, based on the lanl html)
1095 push @query, ($fld => 'Any');
1099 # insure that LANL and GenBank ids are retrieved
1100 push @query, ('SequenceEntry.SE_id' => 'Any') unless grep /SequenceEntry\.SE_id/, @query;
1101 push @query, ('SequenceAccessions.SA_GenBankAccession' => 'Any')
1102 unless grep /SequenceAccessions\.SA_GenBankAccession/, @query;
1104 # an "order" field is required by the LANL CGI
1105 # if not specified, default to SE_id
1107 push @query, ('order'=>'SequenceEntry.SE_id') unless grep /order/, @query;
1109 # @query now contains sfield=>matchdata pairs, as specified by user
1110 # include appropriate indexes to create correct automatic joins
1111 # established by the LANL CGI
1112 my (@qtbl, @qpk, @qfk);
1114 # the tables represented in query:
1115 my %q = @query; # squish the tables in the current query into hash keys
1116 @qtbl = $schema->tbl('-s', keys %q);
1119 # more than one table, see if they can be connected
1120 # get primary keys of query tables
1121 @qpk = $schema->pk(@qtbl);
1123 # we need to get each query table to join to
1126 # The schema is a graph with tables as nodes and
1127 # foreign keys<->primary keys as branches. To get a
1128 # join that works, need to include in the query
1129 # all branches along a path from SequenceEntry
1130 # to each query table.
1132 # find_join does it...
1134 my @k = $schema->find_join($_,'SequenceEntry');
1137 # squish the keys in @joink
1139 @j{@joink} = (1) x
@joink;
1141 # add the fields not currently in the query
1142 foreach (@qpk, @joink) {
1144 if (!grep(/^$fld$/,keys %q)) {
1145 # lazy: 'Any' may not be the right default (but appears to
1146 # be, based on the lanl html)
1147 push @query, ($_ => 'Any');
1153 # set object property
1154 $self->_lanl_query([@query]);
1159 # _do_lanl_request : post the queries created by _create_lanl_query
1161 # @args (or {@args}) should be unaliased Table.Column=>Matchdata
1162 # pairs (these will be used directly in the POSTs)
1164 =head2 _do_lanl_request
1166 Title : _do_lanl_request
1167 Usage : $hiv_query->_do_lanl_request()
1168 Function: Perform search request on _create_lanl_query-validated query
1170 Returns : 1 if successful
1175 sub _do_lanl_request
{
1177 my (@queries, @query, @interface,$interfGet,$searchGet,$response);
1178 my ($numseqs, $count);
1181 if (!$self->_lanl_query) {
1182 $self->throw(-class=>"Bio::Root::BadParameter",
1183 -text
=>"_lanl_query empty, run _create_lanl_request first",
1187 @queries = @
{$self->_lanl_query};
1191 ## search site specific CGI parms
1192 my @search_pms = ('action'=>'Search');
1193 my @searchif_pms = ('action'=>'Search Interface');
1194 my @download_pms = ('action Download.x'=>1, 'action Download.y'=>1);
1196 ## HTML-testing regexps
1197 my $tags_re = qr{(?:\s*<[^>]+>\s*)};
1198 my $session_id_re = qr{<input.*name="id".*value="([0-9a-f]+)"}m;
1199 my $search_form_re = qr{<form[^>]*action=".*/search.comp"};
1200 my $seqs_found_re = qr{Displaying$tags_re*(?:\s*[0-9-]*\s*)*$tags_re*of$tags_re*\s*([0-9]+)$tags_re*sequences found};
1201 my $no_seqs_found_re = qr{Sorry.*no sequences found};
1202 my $too_many_re = qr{too many records: $tags_re*([0-9]+)};
1204 foreach my $q (@queries) {
1206 # default query control parameters
1210 translate
=>'FALSE' # nucleotides
1215 # pull out commands, designated by the COMMAND pseudo-table...
1216 my @commands = map { $query[$_] =~ s/^COMMAND\.// ?
@query[$_..$_+1] : () } (0..$#query-1);
1217 @query = map { $query[$_] =~ /^COMMAND/ ?
() : @query[2*$_..2*$_+1] } (0..($#query-1)/2);
1220 # set control parameters explicitly made in query
1221 foreach my $cp (keys %qctrl) {
1222 if (!grep( /^$cp$/, @query)) {
1223 push @query, ($cp, $qctrl{$cp});
1227 # note that @interface must be an array, since a single 'key' (the table)
1228 # can be associated with multiple 'values' (the columns) in the POST
1230 # squish fieldnames into hash keys
1232 @interface = grep {defined} map {my ($tbl,$col) = /^(.*)\.(.*)$/} keys %q;
1233 eval { # encapsulate communication errors here, defer biothrows...
1235 #mark the useragent should be setable from outside (so we can modify timeouts, etc)
1236 my $ua = new Bio
::WebAgent
(timeout
=> 90);
1237 my $idPing = $ua->get($self->_map_db_uri);
1238 $idPing->is_success || do {
1240 die "Connect failed";
1242 # get the session id
1243 if (!$self->_session_id) {
1244 ($self->{'_session_id'}) = ($idPing->content =~ /$session_id_re/);
1245 $self->_session_id || do {
1247 die "Session not established";
1251 # strange bug: if action=>'Search+Interface' below (note "+"),
1252 # the response to the search (in $searchGet) shows the correct
1253 # >number< of sequences found, but also an error "No sequences
1254 # match" and an SQL barf. Changing the "+" to a " " sets up the
1255 # interface to lead to the actual sequences being delivered as
1257 $interfGet = $ua->post($self->_make_search_if_uri, [@interface, @searchif_pms, id
=>$self->_session_id]);
1258 $interfGet->is_success || do {
1259 $response=$interfGet;
1260 die "Interface request failed";
1262 # see if a search form was returned...
1264 $interfGet->content =~ /$search_form_re/ || do {
1265 $response=$interfGet;
1266 die "Interface request failed";
1269 $searchGet = $ua->post($self->_search_uri, [@query, @commands, @search_pms, id
=>$self->_session_id]);
1270 $searchGet->is_success || do {
1271 $response = $searchGet;
1272 die "Search failed";
1274 $response = $searchGet;
1275 for ($searchGet->content) {
1276 /$no_seqs_found_re/ && do {
1277 die "No sequences found";
1280 /$too_many_re/ && do {
1281 die "Too many records ($1): must be <10000";
1284 /$seqs_found_re/ && do {
1291 die "Search failed (response not parsed)";
1294 $response = $ua->post($self->_search_uri, [@download_pms, id
=>$self->_session_id]);
1295 $response->is_success || die "Query failed";
1296 # $response->content is a tab-separated value table of sequences
1297 # and metadata, first line starts with \# and contains fieldnames
1299 $self->_lanl_response($response);
1300 # throw, if necessary
1302 ($@
!~ "No sequences found") && do {
1303 $self->throw(-class=>'Bio::WebError::Exception',
1310 $self->warn("No sequences found for this query") unless $count;
1311 $self->count($count);
1312 return 1; # made it.
1316 =head2 _parse_lanl_response
1318 Title : _parse_lanl_response
1319 Usage : $hiv_query->_parse_lanl_response()
1320 Function: Parse the tab-separated-value response obtained by _do_lanl_request
1321 for sequence ids, accessions, and annotations
1323 Returns : 1 if successful
1328 sub _parse_lanl_response
{
1330 ### handle parsing and merging multiple responses into the query object
1331 ### (ids and annotations)
1334 my ($seqGet) = (@_);
1335 my (@data, @cols, %antbl, %antype,%anxlt, @ankeys );
1337 my ($schema, @retseqs, %rec, $ac);
1339 'country' => 'isolation_country',
1340 'coreceptor' => 'second_receptor',
1341 'patient health' => 'health_status'
1344 $schema = $self->_schema;
1346 $self->_lanl_response ||
1347 $self->throw(-class=>"Bio::QueryNotMade::Exception",
1348 -text
=>"Query not yet performed; call _do_lanl_request()",
1350 foreach my $rsp (@
{$self->_lanl_response}) {
1351 @data = split("\r|\n", $rsp->content);
1352 $numseq += ( shift(@data) =~ /Number.*:\s([0-9]+)/ )[0];
1353 @cols = split(/\t/, shift @data);
1355 # mappings from column headings to annotation keys
1356 # squish into hash keys
1357 my %q = @
{ shift @
{$self->_lanl_query} };
1358 %antbl = $schema->ankh(keys %q);
1359 foreach (values %antbl) {
1360 $antype{$_->{ankey
}} = $_->{antype
};
1361 push @ankeys, $_->{ankey
};
1365 ### conversion kludge for specials
1366 $k = $specials{lc $k} if (grep /$k/i, keys %specials);
1369 ($k) = grep (/$k$/i, keys %antbl);
1371 $anxlt{$_} = $antbl{$k}->{ankey
};
1375 @rec{@cols} = split /\t/;
1377 $self->add_id($rec{'SE id'});
1378 $ac = $self->add_annotations_for_id($rec{'SE id'});
1380 # need to handle reference, comment, dblink annots
1383 my $k = $anxlt{$_}; # annot key
1385 my $t = $antype{$k}; # annot type
1386 my $d = $rec{$_}; # the data
1388 \$ac->put_value(-KEYS=>[\$t, \$k], -VALUE=>\$d);
1392 $ac->put_value('accession', $rec{Accession
});
1396 return 1; # made it.
1399 =head2 _parse_query_string
1401 Title : _parse_query_string
1402 Usage : $hiv_query->_parse_query_string($str)
1403 Function: Parses a query string using query language emulator QRY
1404 : in L<Bio::DB::Query::HIVQueryHelper>
1406 Returns : arrayref of hash structures suitable for passing to _create_lanl_query
1407 Args : a string scalar
1411 sub _parse_query_string
{
1413 my $qstring = shift;
1415 #syntax errors thrown in QRY (in HIVQueryHelper module)
1416 $ptree = QRY
::_parse_q
( $qstring );
1417 @ret = QRY
::_make_q
($ptree);
1426 Usage : $hiv_query->_sorry("-president=>Powell")
1427 Function: Throws an exception for unsupported option or parameter
1430 Args : scalar string
1437 $self->throw(-class=>"Bio::HIVSorry::Exception",
1438 -text
=>"Sorry, option/parameter \"$parm\" not (yet) supported. See manpage to complain.",