1 # to do: support for comment, reference annotations
3 # $Id: HIVQuery.pm 232 2008-12-11 14:51:51Z maj $
5 # BioPerl module for Bio::DB::Query::LANLQuery
7 # Cared for by Mark A. Jensen <maj@fortinbras.us>
9 # Copyright Mark A. Jensen
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::DB::Query::HIVQuery - Query interface to the Los Alamos HIV Sequence Database
21 $q = new Bio::DB::Query::HIVQuery(" C[subtype] ZA[country] CXCR4[coreceptor] ");
22 $q = new Bio::DB::Query::HIVQuery(
23 -query=>{'subtype'=>'C',
25 'coreceptor'=>'CXCR4'});
27 $ac = $q->get_annotations_by_id(($q->ids)[0]);
28 $ac->get_value('Geo', 'country') # returns 'SOUTH AFRICA'
30 $db = new Bio::DB::HIV();
31 $seqio = $db->get_Stream_by_query($q); # returns annotated Bio::Seqs
33 # get subtype C sequences from South Africa and Brazil,
34 # with associated info on patient health, coreceptor use, and
37 $q = new Bio::DB::Query::HIVQuery(
39 'query' => {'subtype'=>'C',
40 'country'=>['ZA', 'BR']},
41 'annot' => ['patient_health',
43 'days_post_infection']
49 Bio::DB::Query::HIVQuery provides a query-like interface to the
50 cgi-based Los Alamos National Laboratory (LANL) HIV Sequence
51 Database. It uses Bioperl facilities to capture both sequences and
52 annotations in batch in an automated and computable way. Use with
53 L<Bio::DB::HIV> to create C<Bio::Seq> objects and annotated C<Bio::SeqIO>
58 The interface implements a simple query language emulation that understands AND,
59 OR, and parenthetical nesting. The basic query unit is
61 (match1 match2 ...)[fieldname]
63 Sequences are returned for which C<fieldname> equals C<match1 OR match2 OR ...>.
64 These units can be combined with AND, OR and parentheses. For example:
66 (B, C)[subtype] AND (2000, 2001, 2002, 2003)[year] AND ((CN)[country] OR (ZA)[country])
68 which can be shortened to
70 (B C)[subtype] (2000 2001 2002 2003)[year] (CN ZA)[country]
72 The user can specify annotation fields, that do not restrict the query, but
73 arrange for the return of the associated field data for each sequence returned.
74 Specify annotation fields between curly braces, as in:
76 (B C)[subtype] 2000[year] {country cd4_count cd8_count}
78 Annotations can be accessed off the query using methods described in APPENDIX.
80 =head2 Hash specifications for query construction
82 Single query specifications can be made as hash references provided to the
83 C<-query> argument of the constructor. There are two forms:
85 -query => { 'country'=>'BR', 'phenotype'=>'NSI', 'cd4_count'=>'Any' }
89 -query => [ 'country'=>'BR', 'phenotype'=>'NSI', 'cd4_count'=>'Any' ]
93 -query => { 'query' => {'country'=>'BR', 'phenotype'=>'NSI'},
94 'annot' => ['cd4_count'] }
96 In both cases, the CD4 count is included in the annotations returned, but does
97 not restrict the rest of the query.
99 To 'OR' multiple values of a field, use an anonymous array ref:
101 -query => { 'country'=>['ZA','BR','NL'], 'subtype'=>['A', 'C', 'D'] }
103 =head2 Valid query field names
105 An attempt was made to make the query field names natural and easy to
106 remember. Aliases are specified in an XML file (C<lanl-schema.xml>) that is part
107 of the distribution. Custom field aliases can be set up by modifying this file.
109 An HTML cheatsheet with valid field names, aliases, and match data can
110 be generated from the XML by using
111 C<hiv_object-E<gt>help('help.html')>. A query can also be validated
112 locally before it is unleashed on the server; see below.
116 LANL DB annotations have been organized into a number of natural
117 groupings, tagged C<Geo>, C<Patient>, C<Virus>, and <StdMap>. After a
118 successful query, each id is associated with a tree of
119 L<Bio::Annotation::SimpleValue> objects. These can be accessed with
120 methods C<get_value()> and C<put_value()> described in APPENDIX.
122 =head2 Delayed/partial query runs
124 Accessing the LANL DB involves multiple HTTP requests. The query can
125 be instructed to proceed through all (the default) or only some of
126 them, using the named parameter C<RUN_OPTION>.
128 To validate a query locally, use
130 $q = new Bio::DB::Query::HIVQuery( -query => {...}, -RUN_OPTION=>0 )
132 which will throw an exception if a field name or option is invalid.
134 To get a query count only, you can save a server hit by using
136 $q = new Bio::DB::Query::HIVQuery( -query => {...}, -RUN_OPTION=>1 )
138 and asking for C<$q-E<gt>count>. To finish the query, do
142 which picks up where you left off.
144 C<-RUN_OPTION=E<gt>2>, the default, runs the full query, returning ids and
149 You can clear the query results, retaining the same LANL session and query spec,
150 by doing C<$q-E<gt>_reset>. Change the query, and rerun with
151 C<$q-E<gt>_do_query($YOUR_RUN_OPTION)>.
157 User feedback is an integral part of the evolution of this and other
158 Bioperl modules. Send your comments and suggestions preferably to
159 the Bioperl mailing list. Your participation is much appreciated.
161 bioperl-l@bioperl.org - General discussion
162 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
164 =head2 Reporting Bugs
166 Report bugs to the Bioperl bug tracking system to help us keep track
167 of the bugs and their resolution. Bug reports can be submitted via
170 http://bugzilla.open-bio.org/
172 =head1 AUTHOR - Mark A. Jensen
174 Email maj@fortinbras.us
180 The rest of the documentation details each of the object methods.
181 Internal methods are usually preceded with a _
185 # Let the code begin...
187 package Bio
::DB
::Query
::HIVQuery
;
189 use vars
qw( $LANL_BASE $LANL_MAP_DB $LANL_MAKE_SEARCH_IF $LANL_SEARCH $SCHEMA_FILE $RUN_OPTION );
191 # Object preamble - inherits from Bio::DB::QueryI
193 use Bio::Annotation::Collection;
194 use Bio::Annotation::Comment;
195 use Bio::Annotation::Reference;
200 use Bio::DB::HIV::HIVQueryHelper;
202 use base qw(Bio::Root::Root Bio::DB::QueryI);
206 # change base to new search page 01/14/09 /maj
207 $LANL_BASE = "http://www.hiv.lanl.gov/components/sequence/HIV/asearch";
208 $LANL_MAP_DB = "map_db.comp";
209 $LANL_MAKE_SEARCH_IF = "make_search_if.comp";
210 $LANL_SEARCH = "search.comp";
211 $SCHEMA_FILE = Bio
::Root
::IO
->catfile(qw(Bio DB HIV lanl-schema.xml));
212 $RUN_OPTION = 2; # execute query
214 @Bio::SchemaNotInit
::Exception
::ISA
= qw( Bio::Root::Exception );
215 @Bio::WebError
::Exception
::ISA
= qw( Bio::Root::Exception );
216 @Bio::QueryNotMade
::Exception
::ISA
= qw( Bio::Root::Exception );
217 @Bio::QueryStringException
::Exception
::ISA
= qw( Bio::Root::Exception );
218 @Bio::HIVSorry
::Exception
::ISA
= qw
( Bio
::Root
::Exception
);
227 Usage : my $hiv_query = new Bio::DB::Query::HIVQuery();
228 Function: Builds a new Bio::DB::Query::HIVQuery object,
229 running a sequence query against the Los Alamos
230 HIV sequence database
231 Returns : an instance of Bio::DB::Query::HIVQuery
237 my($class,@args) = @_;
238 my $self = $class->SUPER::new
(@args);
239 # constructor option for web agent parameter spec: added 01/14/09 /maj
240 my ($query, $ids, $lanl_base, $lanl_map_db, $lanl_make_search_if, $lanl_search, $schema_file,$run_option, $uahash) =
241 $self->_rearrange([ qw(QUERY
253 $lanl_base||= $LANL_BASE;
254 $lanl_map_db||=$LANL_MAP_DB;
255 $lanl_make_search_if||=$LANL_MAKE_SEARCH_IF;
256 $lanl_search||=$LANL_SEARCH;
257 $schema_file||=$SCHEMA_FILE;
258 $uahash ||= {timeout
=> 90};
259 defined $run_option || ($run_option = $RUN_OPTION);
261 $self->lanl_base($lanl_base);
262 $self->map_db($lanl_map_db);
263 $self->make_search_if($lanl_make_search_if);
264 $self->search_($lanl_search);
265 $self->_run_option($run_option);
266 $self->_ua_hash($uahash);
268 # catch this at the top
269 if (-e
$schema_file) {
270 $self->_schema_file($schema_file);
273 my ($p) = $self->_schema_file( [grep {$_} map {
274 my $p = Bio
::Root
::IO
->catfile($_, $schema_file);
277 $self->throw(-class=>"Bio::Root::NoSuchThing",
278 -text
=>"Schema file \"".$self->_schema_file."\" cannot be found",
279 -value
=>$self->_schema_file) unless -e
$self->_schema_file;
283 $self->{_schema
} = HIVSchema
->new($self->_schema_file);
285 # internal storage and flags
286 $self->{'_lanl_query'} = [];
287 $self->{'_lanl_response'} = [];
288 $self->{'_annotations'} = {}; # container for annotation collections assoc. with ids
289 $self->{'_RUN_LEVEL'} = undef; # set in _do_query()
292 defined $query && $self->query($query);
293 defined $ids && $self->ids($ids);
297 $self->_do_query($self->_run_option) if $self->query;
302 =head1 QueryI compliance
307 Usage : $hiv_query->count($newval)
308 Function: return number of sequences found
310 Returns : value of count (a scalar)
311 Args : on set, new value (a scalar or undef, optional)
312 Note : count warns if it is accessed for reading before query
313 has been executed to at least level 1
319 return $self->{'count'} = shift if @_;
320 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 1)) {
321 $self->warn('Query not yet run at > level 1');
323 return $self->{'count'};
329 Usage : $hiv_query->ids($newval)
330 Function: LANL ids of returned sequences
332 Returns : value of ids (an arrayref of sequence accessions/ids)
333 Args : on set, new value (an arrayref or undef, optional)
341 $self->throw(-class=>'Bio::Root::BadParameter',
342 -text
=>'Arrayref required',
343 -value
=> ref $a) unless ref($a) eq 'ARRAY';
345 @
{$self->{'ids'}}{@
$a} = (1) x @
$a;
348 else { #with empty arrayref, clear the hash
352 return keys %{$self->{'ids'}} if $self->{'ids'};
358 Usage : $hiv_query->query
359 Function: Get/set the submitted query hash or string
361 Returns : hashref or string
362 Args : query in hash or string form (see DESCRIPTION)
368 return $self->{'query'} = shift if @_;
369 return $self->{'query'};
372 =head1 Bio::DB::Query::HIVQuery specific methods
377 Usage : $hiv_query->help("help.html")
378 Function: get html-formatted listing of valid fields/aliases/options
379 based on current schema xml
380 Example : perl -MBio::DB::Query::HIVQuery -e "new Bio::DB::Query::HIVQuery()->help" | lynx -stdin
382 Args : optional filename; otherwise prints to stdout
387 my ($self, $fname) = @_;
389 my $schema = $self->_schema;
392 my (@tbls, @flds, @als, @opts, $fh);
394 open ($fh, ">", $fname) or $self->throw(-class=>'Bio::Root::IOException',
395 -text
=>"Error opening help html file $fname for writing",
401 @tbls = $schema->tables;
402 @tbls = ('COMMAND', grep !/COMMAND/,@tbls);
404 $h->start_html(-title
=>"HIVQuery Help")
406 print $fh $h->a({-id
=>'TOP'}, $h->h2("Valid <span style='font-variant:small-caps'>HIVQuery</span> query fields and match data"));
407 print $fh "Fields are organized below according to their Los Alamos HIV database tables. Use aliases in place of full field names in queries; for example:<br/>";
408 print $fh "<blockquote><code> (CCR5 CXCR4)[coreceptor]</code></blockquote>";
409 print $fh "rather than";
410 print $fh "<blockquote><code>(CCR5 CXCR4)[seq_sample.ssam_second_receptor] </code></blockquote>";
411 print $fh "(which does work, however). Click hyperlinks to see valid search options within the field. The token <code><b>Any</b></code> is the wildcard for all fields.<br/><br/>";
412 print $fh $h->start_table({-style
=>"font-family:sans-serif;"}) ;
413 foreach my $tbl (@tbls) {
414 @flds = grep /^$tbl/, $schema->fields;
415 @flds = grep !/_id/, @flds;
417 $h->start_Tr({-style
=>"background-color: lightblue;"}),
418 $h->td([$h->a({-id
=>$tbl},$tbl), $h->span({-style
=>"font-style:italic"},"fields"), $h->span({-style
=>"font-style:italic"}, "aliases")]),
421 foreach my $fld (@flds) {
422 @als = reverse $schema->aliases($fld);
424 $h->Tr( $h->td( ["", $h->a({-href
=>"#opt$fld"}, shift @als), $h->code(join(',',@als))] ))
426 my @tmp = grep {$_} $schema->options($fld);
427 @tmp = sort {(($a =~ /^[0-9]+$/) && $b =~ /^[0-9]+$/) ?
$a<=>$b : $a cmp $b} @tmp;
428 if (grep /Any/,@tmp) {
429 @tmp = grep !/Any/, @tmp;
432 #print STDERR join(', ',@tmp)."\n";
434 {-style
=>"font-family:sans-serif;font-size:small"},
438 "<i>Valid options for</i> <b>$fld</b>: "),
440 @tmp ?
$h->code(join(", ", @tmp)) : $h->i("free text")
443 "<i>Other aliases</i>: "),
445 @als ?
$h->code(join(",",@als)) : "<i>none</i>"
450 $h->a({-href
=>"#$tbl"}, $h->small('BACK')),
451 $h->a({-href
=>"#TOP"}, $h->small('TOP'))
457 print $fh $h->end_table;
459 print $fh $h->end_html;
464 =head1 Annotation manipulation methods
466 =head2 get_annotations_by_ids
468 Title : get_annotations_by_ids (or ..._by_id)
469 Usage : $ac = $hiv_query->get_annotations_by_ids(@ids)
470 Function: Get the Bio::Annotation::Collection for these sequence ids
472 Returns : A Bio::Annotation::Collection object
473 Args : an array of sequence ids
477 sub get_annotations_by_ids
{
481 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 2)) {
482 $self->warn('Requires query run at level 2');
485 @ret = map {$self->{'_annotations'}->{$_}} @ids if exists($self->{'_annotations'});
487 return (wantarray ?
@ret : $ret[0]) if @ret;
492 sub get_annotations_by_id
{
493 shift->get_annotations_by_ids(@_);
496 =head2 add_annotations_for_id
498 Title : add_annotations_for_id
499 Usage : $hiv_query->add_annotations_for_id( $id ) to create a new
500 empty collection for $id
501 $hiv_query->add_annotations_for_id( $id, $ac ) to associate
503 Function: Associate a Bio::Annotation::Collection with this sequence id
505 Returns : a Bio::Annotation::Collection object
506 Args : sequence id [, Bio::Annotation::Collection object]
510 sub add_annotations_for_id
{
513 $ac = new Bio
::Annotation
::Collection
unless defined $ac;
514 $self->throw(-class=>'Bio::Root::BadParameter'
515 -text
=>'Bio::Annotation::Collection required at arg 2',
516 -value
=>"") unless ref($ac) eq 'Bio::Annotation::Collection';
518 $self->{'_annotations'}->{$id} = $ac unless exists($self->{'_annotations'}->{$id});
522 =head2 remove_annotations_for_ids
524 Title : remove_annotations_for_ids (or ..._for_id)
525 Usage : $hiv_query->remove_annotations_for_ids( @ids)
526 Function: Remove annotation collection for this sequence id
528 Returns : An array of the previous annotation collections for these ids
529 Args : an array of sequence ids
533 sub remove_annotations_for_ids
{
538 push @ac, delete $self->{'_annotations'}->{$_};
544 sub remove_annotations_for_id
{
545 shift->remove_annotations_for_ids(@_);
548 =head2 remove_annotations
550 Title : remove_annotations
551 Usage : $hiv_query->remove_annotations()
552 Function: Remove all annotation collections for this object
554 Returns : The previous annotation collection hash for this object
559 sub remove_annotations
{
562 my $ach = $self->{'_annotations'};
563 $self->{'_annotations'} = {};
570 Usage : $ac->get_value($tagname) -or-
571 $ac->get_value( $tag_level1, $tag_level2,... )
572 Function: access the annotation value assocated with the given tags
575 Args : an array of tagnames that descend into the annotation tree
576 Note : this is a L<Bio::AnnotationCollectionI> method added in
577 L<Bio::DB::HIV::HIVQueryHelper>
584 Usage : $ac->put_value($tagname, $value) -or-
585 $ac->put_value([$tag_level1, $tag_level2, ...], $value) -or-
586 $ac->put_value( [$tag_level1, $tag_level2, ...] )
587 Function: create a node in an annotation tree, and assign a scalar value to it
588 if a value is specified
590 Returns : scalar or a Bio::AnnotationCollection object
591 Args : $tagname, $value scalars (can be specified as -KEYS=>$tagname,
593 \@tagnames, $value (or as -KEYS=>\@tagnames, -VALUE=>$value )
594 Notes : This is a L<Bio::AnnotationCollectionI> method added in
595 L<Bio::DB::HIV::HIVQueryHelper>.
596 If intervening nodes do not exist, put_value creates them, replacing
597 existing nodes. So if $ac->put_value('x', 10) was done, then later,
598 $ac->put_value(['x', 'y'], 20), the original value of 'x' is trashed,
599 and $ac->get_value('x') will now return the annotation collection
604 =head1 GenBank accession manipulation methods
606 =head2 get_accessions
608 Title : get_accessions
609 Usage : $hiv_query->get_accessions()
610 Function: Return an array of GenBank accessions associated with these
611 sequences (available only after a query is subjected to a
612 full run (i.e., when $RUN_OPTION == 2)
614 Returns : array of gb accession numbers, or () if none found for this query
622 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 2)) {
623 $self->warn('Requires query run at level 2');
626 my @ac = $self->get_annotations_by_ids($self->ids);
628 push @ret, $_->get_value('accession');
633 =head2 get_accessions_by_ids
635 Title : get_accessions_by_ids (or ..._by_id)
636 Usage : $hiv_query->get_accessions_by_ids(@ids)
637 Function: Return an array of GenBank accessions associated with these
638 LANL ids (available only after a query is subjected to a
639 full run (i.e., when $RUN_OPTION == 2)
641 Returns : array of gb accession numbers, or () if none found for this query
646 sub get_accessions_by_ids
{
650 if (!$self->{'_RUN_LEVEL'} || ($self->{'_RUN_LEVEL'} < 2)) {
651 $self->warn('Requires query run at level 2');
654 my @ac = $self->get_annotations_by_ids(@ids);
656 push @ret, $_->get_value('accession');
658 return wantarray ?
@ret : $ret[0];
662 sub get_accessions_by_id
{
663 shift->get_accessions_by_ids(@_);
668 =head1 Query control methods
673 Usage : $hiv_query->_do_query or $hiv_query->_do_query($run_level)
674 Function: Execute the query according to argument or $RUN_OPTION
676 extent of query reflects the value of argument
677 0 : validate only (no HTTP action)
678 1 : return sequence count only
679 2 : return sequence ids (full query, returns with annotations)
680 noop if current _RUN_LEVEL of query is >= argument or $RUN_OPTION,
682 Returns : actual _RUN_LEVEL (0, 1, or 2) achieved
683 Args : desired run level (optional, global $RUN_OPTION is default)
689 $rl = $RUN_OPTION unless defined $rl;
690 $self->throw(-class=>"Bio::Root::BadParameter",
691 -text
=>"Invalid run option \"$RUN_OPTION\"",
692 -value
=>$RUN_OPTION) unless grep /^$RUN_OPTION$/, (0, 1, 2);
693 (!defined($self->{'_RUN_LEVEL'})) && do {
694 $self->_create_lanl_query();
695 $self->{'_RUN_LEVEL'} = 0;
697 ($rl > 0) && (!defined($self->{'_RUN_LEVEL'}) || ($self->{'_RUN_LEVEL'} <= 0)) && do {
698 $self->_do_lanl_request();
699 $self->{'_RUN_LEVEL'} = 1;
701 ($rl > 1) && (!defined($self->{'_RUN_LEVEL'}) || ($self->{'_RUN_LEVEL'} <= 1)) && do {
702 $self->_parse_lanl_response();
703 $self->{'_RUN_LEVEL'} = 2;
705 return $self->{'_RUN_LEVEL'};
711 Usage : $hiv_query->_reset
712 Function: Resets query storage, count, and ids, while retaining session id,
713 original query string, and db schema
724 $self->{'_annotations'} = {};
725 $self->{'_lanl_response'} = [];
726 $self->{'_lanl_query'} = [];
727 $self->{'_RUN_LEVEL'} = undef;
734 Usage : $hiv_query->_session_id($newval)
735 Function: Get/set HIV db session id (initialized in _do_lanl_request)
737 Returns : value of _session_id (a scalar)
738 Args : on set, new value (a scalar or undef, optional)
745 return $self->{'_session_id'} = shift if @_;
746 return $self->{'_session_id'};
752 Usage : $hiv_query->_run_option($newval)
753 Function: Get/set HIV db query run option (see _do_query for values)
755 Returns : value of _run_option (a scalar)
756 Args : on set, new value (a scalar or undef, optional)
763 return $self->{'_run_option'} = shift if @_;
764 return $self->{'_run_option'};
770 Usage : $obj->_ua_hash($newval)
773 Returns : value of _ua_hash (a scalar)
774 Args : on set, new value (a scalar or undef, optional)
783 $self->{'_ua_hash'} = $_[0];
787 $self->{'_ua_hash'} = {@_};
791 $self->throw("Type ".ref($_)." unsupported as arg in _ua_hash");
796 return %{$self->{'_ua_hash'}};
807 Usage : $hiv_query->add_id($id)
808 Function: Add new id to ids
818 ${$self->{'ids'}}{$id}++;
825 return $self->{'lanl_base'} = shift if @_;
826 return $self->{'lanl_base'};
832 Usage : $obj->map_db($newval)
835 Returns : value of map_db (a scalar)
836 Args : on set, new value (a scalar or undef, optional)
842 return $self->{'map_db'} = shift if @_;
843 return $self->{'map_db'};
846 =head2 make_search_if
848 Title : make_search_if
849 Usage : $obj->make_search_if($newval)
852 Returns : value of make_search_if (a scalar)
853 Args : on set, new value (a scalar or undef, optional)
859 return $self->{'make_search_if'} = shift if @_;
860 return $self->{'make_search_if'};
866 Usage : $obj->search_($newval)
869 Returns : value of search_ (a scalar)
870 Args : on set, new value (a scalar or undef, optional)
876 return $self->{'search_'} = shift if @_;
877 return $self->{'search_'};
884 Function: return the full map_db uri ("Database Map")
886 Returns : scalar string
893 return $self->lanl_base."/".$self->map_db;
897 =head2 _make_search_if_uri
899 Title : _make_search_if_uri
901 Function: return the full make_search_if uri ("Make Search Interface")
903 Returns : scalar string
908 sub _make_search_if_uri
{
910 return $self->lanl_base."/".$self->make_search_if;
917 Function: return the full search cgi uri ("Search Database")
919 Returns : scalar string
926 return $self->lanl_base."/".$self->search_;
932 Usage : $hiv_query->_schema_file($newval)
935 Returns : value of _schema_file (an XML string or filename)
936 Args : on set, new value (an XML string or filename, or undef, optional)
943 return $self->{'_schema_file'} = shift if @_;
944 return $self->{'_schema_file'};
950 Usage : $hiv_query->_schema($newVal)
953 Returns : value of _schema (an HIVSchema object in package
954 L<Bio::DB::HIV::HIVQueryHelper>)
955 Args : none (field set directly in new())
963 return $self->{'_schema'} :
964 $self->throw(-class=>'Bio::SchemaNotInit::Exception',
965 -text
=>"DB schema not initialized",
973 Usage : $hiv_query->_lanl_query(\@query_parms)
974 Function: pushes \@query_parms onto @{$self->{'_lanl_query'}
976 Returns : value of _lanl_query (an arrayref)
977 Args : on set, new value (an arrayref or undef, optional)
984 return $self->{'_lanl_query'} unless $a;
985 if (ref $a eq 'ARRAY') {
986 push @
{$self->{'_lanl_query'}}, $a;
990 $self->throw(-class=>'Bio::Root::BadParameter',
991 -text
=>'Array ref required for argument.',
997 =head2 _lanl_response
999 Title : _lanl_response
1000 Usage : $hiv_query->_lanl_response($response)
1001 Function: pushes $response onto @{$hiv_query->{'_lanl_response'}}
1003 Returns : value of _lanl_response (an arrayref of HTTP::Response objects)
1004 Args : on set, new value (an HTTP::Response object or undef, optional)
1012 $self->throw(-class=>'Bio::Root::BadParameter',
1013 -text
=>'Requires an HTTP::Response object',
1014 -value
=> ref $r) unless ref($r) eq 'HTTP::Response';
1015 push @
{$self->{'_lanl_response'}}, $r;
1018 return $self->{'_lanl_response'};
1021 =head2 _create_lanl_query
1023 Title : _create_lanl_query
1024 Usage : $hiv_query->_create_lanl_query()
1025 Function: validate query hash or string, prepare for _do_lanl_request
1027 Returns : 1 if successful; throws exception on invalid query
1032 sub _create_lanl_query
{
1034 my (%inhash, @query, @qhashes);
1035 my ($schema, @validFields, @validAliases);
1037 for ($self->query) {
1039 $self->throw(-class=>'Bio::Root::NoSuchThing',
1040 -text
=>'Query not specified',
1044 ref eq 'HASH' && do {
1046 if ( grep /HASH/, map {ref} values %inhash ) {
1047 # check for {query=>{},annot=>[]} style
1048 $self->throw(-class=>'Bio::Root::BadParameter',
1049 -text
=>'Query style unrecognized',
1050 -value
=>"") unless defined $inhash{query
};
1055 ref eq 'ARRAY' && do {
1056 $inhash{'query'} = {@
$_};
1057 push @qhashes, \
%inhash;
1062 @qhashes = $self->_parse_query_string($_);
1065 $schema = $self->_schema;
1066 @validFields = $schema->fields;
1067 @validAliases = $schema->aliases;
1069 # validate args based on the xml specification file
1070 # only checks blanks and fields with explicitly specified options
1071 # text fields can put anything, and the query will be run before
1072 # an error is caught in these
1073 foreach my $qh (@qhashes) {
1075 foreach my $k (keys %{$$qh{'query'}}) {
1078 if (grep /^$k$/, @validFields) {
1081 elsif (grep /^$k$/, @validAliases) {
1082 foreach (@validFields) {
1083 if (grep (/^$k$/, $schema->aliases($_))) {
1087 # $fld contains the field corresp. to the alias
1091 $self->throw(-class=>'Bio::Root::BadParameter',
1092 -text
=>"Invalid field or alias \"$k\"",
1095 # validate matchdata
1096 my $vf = $schema->_sfieldh($fld);
1097 my @md = (ref($qh->{'query'}{$k}) eq 'ARRAY') ? @
{$qh->{'query'}{$k}} : $qh->{'query'}{$k};
1098 if ($$vf{type
} eq 'text') {
1100 $self->throw(-class=>'Bio::Root::BadParameter',
1101 -text
=>'Value for field \"$k\" cannot be empty',
1103 if ($_ eq "") && ($$vf{blank_ok
} eq 'false');
1106 elsif ($$vf{type
} eq 'option') {
1107 foreach my $md (@md) {
1108 $self->throw(-class=>'Bio::Root::BadParameter',
1109 -text
=>"Invalid value \"".$md."\" for field \"$fld\"",
1111 unless $$vf{option
} && grep {defined $_ && /^$md$/} @
{$$vf{option
}};
1114 # validated; add to query
1116 push @query, ($fld => $_);
1119 if ($qh->{'annot'}) {
1120 # validate the column names to be included in the query
1121 # to obtain annotations
1122 my @annot_cols = @
{$qh->{'annot'}};
1123 foreach my $k (@annot_cols) {
1126 if (grep /^$k$/, @validFields) {
1129 elsif (grep /^$k$/, @validAliases) {
1130 foreach (@validFields) {
1131 if (grep (/^$k$/, $schema->aliases($_))) {
1135 # $fld should contain the field corresp. to the alias
1139 $self->throw(-class=>'Bio::Root::NoSuchThing',
1140 -text
=>"Invalid field or alias \"$k\"",
1143 # lazy: 'Any' may not be the right default (but appears to
1144 # be, based on the lanl html)
1145 push @query, ($fld => 'Any');
1149 # insure that LANL and GenBank ids are retrieved
1150 push @query, ('sequenceentry.se_id' => 'Any') unless grep /SequenceEntry\.SE_id/, @query;
1151 push @query, ('sequenceaccessions.sa_genbankaccession' => 'Any')
1152 unless grep /SequenceAccessions\.SA_GenBankAccession/, @query;
1154 # an "order" field is required by the LANL CGI
1155 # if not specified, default to SE_id
1157 push @query, ('order'=>'sequenceentry.se_id') unless grep /order/, @query;
1159 # @query now contains sfield=>matchdata pairs, as specified by user
1160 # include appropriate indexes to create correct automatic joins
1161 # established by the LANL CGI
1162 my (@qtbl, @qpk, @qfk);
1164 # the tables represented in query:
1165 my %q = @query; # squish the tables in the current query into hash keys
1166 @qtbl = $schema->tbl('-s', keys %q);
1169 # more than one table, see if they can be connected
1170 # get primary keys of query tables
1171 @qpk = $schema->pk(@qtbl);
1173 # we need to get each query table to join to
1176 # The schema is a graph with tables as nodes and
1177 # foreign keys<->primary keys as branches. To get a
1178 # join that works, need to include in the query
1179 # all branches along a path from SequenceEntry
1180 # to each query table.
1182 # find_join does it...
1184 my @k = $schema->find_join($_,'sequenceentry');
1187 # squish the keys in @joink
1189 @j{@joink} = (1) x
@joink;
1191 # add the fields not currently in the query
1192 foreach (@qpk, @joink) {
1194 if (!grep(/^$fld$/,keys %q)) {
1195 # lazy: 'Any' may not be the right default (but appears to
1196 # be, based on the lanl html)
1197 push @query, ($_ => 'Any');
1203 # set object property
1204 $self->_lanl_query([@query]);
1209 # _do_lanl_request : post the queries created by _create_lanl_query
1211 # @args (or {@args}) should be unaliased Table.Column=>Matchdata
1212 # pairs (these will be used directly in the POSTs)
1214 =head2 _do_lanl_request
1216 Title : _do_lanl_request
1217 Usage : $hiv_query->_do_lanl_request()
1218 Function: Perform search request on _create_lanl_query-validated query
1220 Returns : 1 if successful
1225 sub _do_lanl_request
{
1227 my (@queries, @query, @interface,$interfGet,$searchGet,$response);
1228 my ($numseqs, $count);
1231 if (!$self->_lanl_query) {
1232 $self->throw(-class=>"Bio::Root::BadParameter",
1233 -text
=>"_lanl_query empty, run _create_lanl_request first",
1237 @queries = @
{$self->_lanl_query};
1241 ## search site specific CGI parms
1242 my @search_pms = ('action'=>'Search');
1243 my @searchif_pms = ('action'=>'Search Interface');
1244 # don't get the actual sequence data here (i.e., the cgi parm
1245 # 'incl_seq' remains undefined...
1246 my @download_pms = ('action Download.x'=>1, 'action Download.y'=>1);
1248 ## HTML-testing regexps
1249 my $tags_re = qr{(?:\s*<[^>]+>\s*)};
1250 my $session_id_re = qr{<input.*name="id".*value="([0-9a-f]+)"}m;
1251 my $search_form_re = qr{<form[^>]*action=".*/search.comp"};
1252 my $seqs_found_re = qr{Displaying$tags_re*(?:\s*[0-9-]*\s*)*$tags_re*of$tags_re*\s*([0-9]+)$tags_re*sequences found};
1253 my $no_seqs_found_re = qr{Sorry.*no sequences found};
1254 my $too_many_re = qr{too many records: $tags_re*([0-9]+)};
1255 # find something like:
1256 # <strong>tables without join:</strong><br>SequenceAccessions<br>
1257 my $tbl_no_join_re = qr{tables without join}i;
1258 # my $sorry_bud_re = qr{};
1261 foreach my $q (@queries) {
1263 # default query control parameters
1267 translate
=>'FALSE' # nucleotides
1272 # pull out commands, designated by the COMMAND pseudo-table...
1273 my @commands = map { $query[$_] =~ s/^COMMAND\.// ?
@query[$_..$_+1] : () } (0..$#query-1);
1274 @query = map { $query[$_] =~ /^COMMAND/ ?
() : @query[2*$_..2*$_+1] } (0..($#query-1)/2);
1277 # set control parameters explicitly made in query
1278 foreach my $cp (keys %qctrl) {
1279 if (!grep( /^$cp$/, @query)) {
1280 push @query, ($cp, $qctrl{$cp});
1284 # note that @interface must be an array, since a single 'key' (the table)
1285 # can be associated with multiple 'values' (the columns) in the POST
1287 # squish fieldnames into hash keys
1289 @interface = grep {defined} map {my ($tbl,$col) = /^(.*)\.(.*)$/} keys %q;
1290 eval { # encapsulate communication errors here, defer biothrows...
1292 #mark the useragent should be setable from outside (so we can modify timeouts, etc)
1293 my $ua = new Bio
::WebAgent
($self->_ua_hash);
1294 my $idPing = $ua->get($self->_map_db_uri);
1295 $idPing->is_success || do {
1297 die "Connect failed";
1299 # get the session id
1300 if (!$self->_session_id) {
1301 ($self->{'_session_id'}) = ($idPing->content =~ /$session_id_re/);
1302 $self->_session_id || do {
1304 die "Session not established";
1308 # strange bug: if action=>'Search+Interface' below (note "+"),
1309 # the response to the search (in $searchGet) shows the correct
1310 # >number< of sequences found, but also an error "No sequences
1311 # match" and an SQL barf. Changing the "+" to a " " sets up the
1312 # interface to lead to the actual sequences being delivered as
1314 $interfGet = $ua->post($self->_make_search_if_uri, [@interface, @searchif_pms, id
=>$self->_session_id]);
1315 $interfGet->is_success || do {
1316 $response=$interfGet;
1317 die "Interface request failed";
1319 # see if a search form was returned...
1321 $interfGet->content =~ /$search_form_re/ || do {
1322 $response=$interfGet;
1323 die "Interface request failed";
1326 $searchGet = $ua->post($self->_search_uri, [@query, @commands, @search_pms, id
=>$self->_session_id]);
1327 $searchGet->is_success || do {
1328 $response = $searchGet;
1329 die "Search failed";
1331 $response = $searchGet;
1332 for ($searchGet->content) {
1333 /$no_seqs_found_re/ && do {
1334 die "No sequences found";
1337 /$too_many_re/ && do {
1338 die "Too many records ($1): must be <10000";
1341 /$tbl_no_join_re/ && do {
1342 die "Some required tables went unjoined to query";
1345 /$seqs_found_re/ && do {
1352 die "Search failed (response not parsed)";
1355 $response = $ua->post($self->_search_uri, [@download_pms, id
=>$self->_session_id]);
1356 $response->is_success || die "Query failed";
1357 # $response->content is a tab-separated value table of sequences
1358 # and metadata, first line starts with \# and contains fieldnames
1360 $self->_lanl_response($response);
1361 # throw, if necessary
1363 ($@
!~ "No sequences found") && do {
1364 $self->throw(-class=>'Bio::WebError::Exception',
1371 $self->warn("No sequences found for this query") unless $count;
1372 $self->count($count);
1373 return 1; # made it.
1377 =head2 _parse_lanl_response
1379 Title : _parse_lanl_response
1380 Usage : $hiv_query->_parse_lanl_response()
1381 Function: Parse the tab-separated-value response obtained by _do_lanl_request
1382 for sequence ids, accessions, and annotations
1384 Returns : 1 if successful
1389 sub _parse_lanl_response
{
1391 ### handle parsing and merging multiple responses into the query object
1392 ### (ids and annotations)
1395 my ($seqGet) = (@_);
1396 my (@data, @cols, %antbl, %antype,%anxlt, @ankeys );
1398 my ($schema, @retseqs, %rec, $ac);
1400 'country' => 'sample_country',
1401 'coreceptor' => 'second_receptor',
1402 'patient health' => 'health_status',
1403 'year' => 'sample_year'
1406 $schema = $self->_schema;
1408 $self->_lanl_response ||
1409 $self->throw(-class=>"Bio::QueryNotMade::Exception",
1410 -text
=>"Query not yet performed; call _do_lanl_request()",
1412 foreach my $rsp (@
{$self->_lanl_response}) {
1413 @data = split(/\r|\n/, $rsp->content);
1414 $numseq += ( shift(@data) =~ /Number.*:\s([0-9]+)/ )[0];
1415 @cols = split(/\t/, shift @data);
1417 # mappings from column headings to annotation keys
1418 # squish into hash keys
1419 my %q = @
{ shift @
{$self->_lanl_query} };
1420 %antbl = $schema->ankh(keys %q);
1421 foreach (values %antbl) {
1423 my $k = $_->{ankey
};
1426 $_->{ankey
} = $k; #replace with normalized version
1427 $antype{$k} = $_->{antype
};
1430 foreach (@cols) { #these are the data column headers
1434 ### conversion kludge for specials
1435 ### (i.e.,column headers that do not match the
1436 ### true field names)
1437 $c = $specials{$c} if (grep /$c/, keys %specials);
1440 ### following line grep: looks for a match of the
1441 ### column name at the end of the true field names to
1442 ### make the translation...
1443 ### only captures the first match.
1444 my ($match_fld) = grep (/$c$/i, keys %antbl);
1445 $anxlt{$_} = $antbl{$match_fld}->{ankey
} if $match_fld;
1448 @rec{@cols} = split /\t/;
1449 my $id = $rec{'SE_id'};
1452 $ac = new Bio
::Annotation
::Collection
();
1455 # need to handle reference, comment, dblink annots
1457 #accession should be added in here as a matter of course
1458 my $k = $anxlt{$_}; # annot key
1460 my $t = $antype{$k}; # annot type
1461 my $d = $rec{$_}; # the data
1462 $ac->put_value(-KEYS
=>[$t, $k], -VALUE
=>$d) if $k;
1464 $self->add_annotations_for_id($id, $ac);
1468 return 1; # made it.
1471 =head2 _parse_query_string
1473 Title : _parse_query_string
1474 Usage : $hiv_query->_parse_query_string($str)
1475 Function: Parses a query string using query language emulator QRY
1476 : in L<Bio::DB::Query::HIVQueryHelper>
1478 Returns : arrayref of hash structures suitable for passing to _create_lanl_query
1479 Args : a string scalar
1483 sub _parse_query_string
{
1485 my $qstring = shift;
1487 #syntax errors thrown in QRY (in HIVQueryHelper module)
1488 $ptree = QRY
::_parse_q
( $qstring );
1489 @ret = QRY
::_make_q
($ptree);
1498 Usage : $hiv_query->_sorry("-president=>Powell")
1499 Function: Throws an exception for unsupported option or parameter
1502 Args : scalar string
1509 $self->throw(-class=>"Bio::HIVSorry::Exception",
1510 -text
=>"Sorry, option/parameter \"$parm\" not (yet) supported. See manpage to complain.",