1 # $Id: HIV.pm 232 2008-12-11 14:51:51Z maj $
3 # BioPerl module for Bio::DB::HIV
5 # Cared for by Mark A. Jensen <maj@fortinbras.us>
7 # Copyright Mark A. Jensen
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::DB::HIV - Database object interface to the Los Alamos HIV Sequence Database
19 $db = new Bio::DB::HIV;
21 $seq = $db->get_Seq_by_id('94284'); # LANL sequence id
22 $seq = $db->get_Seq_by_acc('EF432710'); # GenBank accession
24 $q = new Bio::DB::Query::HIVQuery( " (C D)[subtype] SI[phenotype] (symptomatic AIDS)[patient_health] " );
26 $seqio = $db->get_Stream_by_query($q);
27 $seq = $seqio->next_seq();
28 ($seq->annotation->get_Annotations('Virus'))[0]->{subtype} # returns 'D'
29 ($seq->annotation->get_Annotations('Patient'))[0]->{patient_health} # returns 'AIDS'
30 ($seq->annotation->get_Annotations('accession'))[0]->{value} # returns 'K03454'
34 Bio::DB::HIV, along with L<Bio::DB::Query::HIVQuery>, provides an
35 interface for obtaining annotated HIV and SIV sequences from the Los
36 Alamos National Laboratory (LANL) HIV Sequence Database (
37 L<http://www.hiv.lanl.gov/content/sequence/HIV/mainpage.html>
38 ). Unannotated sequences can be retrieved directly from the database
39 object, using either LANL ids or GenBank accessions. Annotations are
40 obtained via a query object, and are attached to the correct C<Bio::Seq>
41 objects when the query is handled by C<Bio::DB::HIV::get_Seq_by_query>
42 or C<Bio::DB::HIV::get_Stream_by_query>.
48 User feedback is an integral part of the evolution of this and other
49 Bioperl modules. Send your comments and suggestions preferably to
50 the Bioperl mailing list. Your participation is much appreciated.
52 bioperl-l@bioperl.org - General discussion
53 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via
61 http://bugzilla.open-bio.org/
63 =head1 AUTHOR - Mark A. Jensen
65 Email maj@fortinbras.us
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
76 # Let the code begin...
81 use vars
qw( $LANL_BASE $LANL_MAP_DB $LANL_MAKE_SEARCH_IF $LANL_SEARCH );
83 # Object preamble - inherits from Bio::DB::WebDBSeqI
86 use HTTP::Request::Common;
87 use Bio::DB::HIV::HIVAnnotProcessor;
89 use base qw(Bio::DB::WebDBSeqI);
93 $LANL_BASE = "http://www.hiv.lanl.gov/components/sequence/HIV/advanced_search";
94 $LANL_MAP_DB = "map_db.comp";
95 $LANL_MAKE_SEARCH_IF = "make_search_if.comp";
96 $LANL_SEARCH = "search.comp";
97 @Bio::ResponseProblem
::Exception
::ISA
= qw( Bio::Root::Exception );
98 @Bio::HIVSorry
::Exception
::ISA
= qw
( Bio
::Root
::Exception
);
99 @Bio::WebError
::Exception
::ISA
= qw( Bio::Root::Exception );
107 Usage : my $obj = new Bio::DB::HIV();
108 Function: Builds a new Bio::DB::HIV object
109 Returns : an instance of Bio::DB::HIV
115 my($class,@args) = @_;
117 my $self = $class->SUPER::new
(@args);
118 my ($lanl_base, $lanl_map_db, $lanl_make_search_if, $lanl_search) =
119 $self->_rearrange([qw(
126 $lanl_base && $self->lanl_base($lanl_base);
127 $lanl_map_db && $self->map_db($lanl_map_db);
128 $lanl_make_search_if && $self->make_search_if($lanl_make_search_if);
129 $lanl_search && $self->search_($lanl_search);
131 $self->lanl_base || $self->lanl_base($LANL_BASE);
132 $self->map_db || $self->map_db($LANL_MAP_DB);
133 $self->make_search_if || $self->make_search_if($LANL_MAKE_SEARCH_IF);
134 $self->search_ || $self->search_($LANL_SEARCH);
135 $self->url_base_address || $self->url_base_address($self->lanl_base);
137 $self->request_format("fasta");
142 =head1 WebDBSeqI compliance
147 Usage : my $url = $self->get_request
148 Function: returns a HTTP::Request object
150 Args : %qualifiers = a hash of qualifiers with keys in
151 (-ids, -format, -mode, -query)
152 Note : Several layers of requests are performed to get to the sequence;
153 see Bio::DB::Query::HIVQuery.
161 my (@ids, $mode, @interface, @query_parms, $query);
163 # html parsing regexps
164 my $tags_re = qr{(?:\s*<[^>]+>\s*)};
165 my $session_id_re = qr{<input.*name="id".*value="([0-9a-f]+)"}m;
166 my $search_form_re = qr{<form[^>]*action=".*/search.comp"};
167 my $seqs_found_re = qr{Displaying$tags_re*(?:\s*[0-9-]*\s*)*$tags_re*of$tags_re*\s*([0-9]+)$tags_re*sequences found};
168 my $no_seqs_found_re = qr{Sorry.*no sequences found};
170 # handle "qualifiers"
171 foreach (keys %quals) {
177 $self->throw(-class=>"Bio::Root::BadParameter",
178 -text
=>"Arrayref required for qualifier \"$_\"",
179 -value
=>$quals{$_}) unless ref($quals{$_}) eq 'ARRAY';
180 @ids = @
{$quals{$_}};
184 $self->throw(-class=>"Bio::Root::BadParameter",
185 -text
=>"Bio::DB::Query::HIVQuery required for qualifier \"$_\"",
186 -value
=>$quals{$_}) unless $quals{$_}->isa("Bio::DB::Query::HIVQuery");
194 # what kind of request?
196 ($m =~ m/single/) && do {
198 'SequenceEntry' => 'SE_Sequence',
199 'SequenceEntry' => 'SE_id',
200 'action' => 'Search Interface'
202 @query_parms = map { ('SequenceEntry.SE_id' => $_ ) } @ids;
204 'SequenceEntry.SE_Sequence'=>'Any',
205 'order' => 'SequenceEntry.SE_id',
210 ($mode =~ m/acc/) && do {
212 'SequenceEntry' => 'SE_Sequence',
213 'SequenceEntry' => 'SE_id',
214 'SequenceAccessions' => 'SA_GenBankAccession',
215 'SequenceAccessions' => 'SA_SE_id',
216 'action' => 'Search Interface'
218 @query_parms = map {('SequenceAccessions.SA_GenBankAccession' => $_)} @ids;
220 'SequenceEntry.SE_Sequence' => 'Any',
221 'order' => 'SequenceAccessions.SA_GenBankAccession',
226 ($mode =~ m/gi/) && do {
227 $self->_sorry("-mode=>gi");
229 ($mode =~ m/version/) && do {
230 $self->_sorry("-mode=>version");
232 ($mode =~ m/query/) && do {
233 $self->throw(-class=>"Bio::Root::BadParameter",
234 -text
=>"Query ".($query->{'_RUN_LEVEL'} ?
"has been run only at run level ".$query->{'_RUN_LEVEL'} : "has not been run").", run at level 2 with _do_query(2)",
235 -value
=>$query->{'_RUN_LEVEL'}) unless $query->{'_RUN_LEVEL'} == 2;
237 'SequenceEntry' => 'SE_Sequence',
238 'SequenceEntry' => 'SE_id',
239 'action' => 'Search Interface'
241 @query_parms = map { ( "SequenceEntry.SE_id" => $_ ) } $query->ids;
243 'SequenceEntry.SE_Sequence' => 'Any',
244 'order' => 'SequenceEntry.SE_id',
254 eval { # capture web errors; throw below...
255 # negotiate a session with lanl db
256 if (!$self->_session_id) {
257 $resp = $self->ua->get($self->_map_db_uri);
258 $resp->is_success or die "Connect failed";
260 if (!$self->_session_id) {
261 ($self->{'_session_id'}) = ($resp->content =~ /$session_id_re/);
262 $self->_session_id or die "Session not established";
266 # establish correct "interface" for this session id
267 $resp = $self->ua->post($self->_make_search_if_uri, [@interface, id
=>$self->_session_id]);
268 $resp->is_success or die "Interface request failed (1)";
269 $resp->content =~ /$search_form_re/ or die "Interface request failed (2)";
271 # interface successful, do the "pre-search"
272 $resp = $self->ua->post($self->_search_uri, [@query_parms, 'id' => $self->_session_id]);
273 $resp->is_success or die "Search post failed";
274 ($resp->content !~ /$no_seqs_found_re/) or die "No sequences found";
275 ($resp->content =~ /$seqs_found_re/) or die "Unparsed failure";
277 $self->throw(-class=>'Bio::WebError::Exception',
281 # "pre-search" successful, return request
283 return POST
$self->_search_uri,
284 ['action Download.x' => 1,
285 'action Download.y'=>1,
286 'id'=>$self->_session_id
292 =head2 postprocess_data
294 Title : postprocess_data
295 Usage : $self->postprocess_data ( 'type' => 'string',
296 'location' => \$datastr);
297 Function: process downloaded data before loading into a Bio::SeqIO
299 Args : hash with two keys - 'type' can be 'string' or 'file'
300 - 'location' either file location or string
301 reference containing data
305 sub postprocess_data
{
306 # parse tab-separated value content from LANL db
307 my ( $self, %args) = @_;
308 my ($type, $loc) = ($args{type
}, $args{location
});
309 my (@data, @cols, %rec, $idkey, @flines);
310 $self->throw(-class=>'Bio::Root::BadParameter',
311 -text
=>"Argument hash requires values for keys \"type\" and \"location\"",
312 -value
=>\
%args) unless ($type && $loc);
315 @data = split(/\n|\r/, ${$loc});
321 open (my $F, "<", $loc) or
323 -class=>'Bio::Root::FileOpenException',
324 -text
=>"Error opening tempfile \"$loc\" for reading",
327 @data = split( /\n|\r/, <F
>);
335 $self->throw(-class=>'Bio::Root::BadParameter',
336 -text
=>'No data found in repsonse',
337 -value
=>%args) unless (@data);
338 shift @data; # number-returned line
339 @cols = split( /\t/, shift @data);
341 # if Accession column is present, get_Stream_by_acc was called
342 # otherwise, return lanl ids
343 ($idkey) = grep /SE.id/, @cols unless ($idkey) = grep /Accession/, @cols;
344 $self->throw(-class=>"Bio::ResponseProblem::Exception",
345 -text
=>"Trouble with column headers in LANL response",
346 -value
=>\
@cols) unless $idkey;
350 @rec{@cols} = split /\t/;
351 push @flines, ">$rec{$idkey}\n".$rec{'Sequence'}."\n";
355 ${$loc} = join("", @flines);
359 open(F
, ">", $loc) or $self->throw(-class=>'Bio::Root::FileOpenException',
360 -text
=>'Error opening tempfile \"$loc\" for writing',
362 print F
join("", @flines);
373 =head1 WebDBSeqI overrides
375 =head2 get_seq_stream
377 Title : get_seq_stream
378 Usage : my $seqio = $self->get_seq_stream(%qualifiers)
379 Function: builds a url and queries a web db
380 Returns : a Bio::SeqIO stream capable of producing sequence
381 Args : %qualifiers = a hash qualifiers that the implementing class
382 will process to make a url suitable for web querying
383 Note : Some tightening up of the baseclass version
388 my ($self, %qualifiers) = @_;
389 my ($rformat, $ioformat) = $self->request_format();
391 my ($key) = grep /format$/, keys %qualifiers;
392 $qualifiers{'-format'} = ($key ?
$qualifiers{$key} : $rformat);
393 ($rformat, $ioformat) = $self->request_format($qualifiers{'format'});
395 # web work is here/maj
396 my $request = $self->get_request(%qualifiers);
398 # authorization is here/maj
399 $request->proxy_authorization_basic($self->authentication)
400 if ( $self->authentication);
401 $self->debug("request is ". $request->as_string(). "\n");
403 # workaround for MSWin systems (no forking available/maj)
404 $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O
=~ /^MSWin/;
406 if ($self->retrieval_type =~ /pipeline/) {
407 # Try to create a stream using POSIX fork-and-pipe facility.
408 # this is a *big* win when fetching thousands of sequences from
409 # a web database because we can return the first entry while
410 # transmission is still in progress.
411 # Also, no need to keep sequence in memory or in a temporary file.
412 # If this fails (Windows, MacOS 9), we fall back to non-pipelined access.
414 # fork and pipe: _stream_request()=><STREAM>
415 my ($result,$stream) = $self->_open_pipe();
417 if (defined $result) {
418 $DB::fork_TTY
= File
::Spec
->devnull; # prevents complaints from debugge
419 if (!$result) { # in child process
420 $self->_stream_request($request,$stream);
421 POSIX
::_exit
(0); #prevent END blocks from executing in this forked child
424 return Bio
::SeqIO
->new('-verbose' => $self->verbose,
425 '-format' => $ioformat,
430 $self->retrieval_type('io_string');
434 if ($self->retrieval_type =~ /temp/i) {
435 my $dir = $self->io->tempdir( CLEANUP
=> 1);
436 my ( $fh, $tmpfile) = $self->io()->tempfile( DIR
=> $dir );
438 my $resp = $self->_request($request, $tmpfile);
439 if( ! -e
$tmpfile || -z
$tmpfile || ! $resp->is_success() ) {
440 $self->throw("WebDBSeqI Error - check query sequences!\n");
442 $self->postprocess_data('type' => 'file','location' => $tmpfile);
443 # this may get reset when requesting batch mode
444 ($rformat,$ioformat) = $self->request_format();
445 if( $self->verbose > 0 ) {
446 open(my $ERR, "<", $tmpfile);
447 while(<$ERR>) { $self->debug($_);}
450 return Bio
::SeqIO
->new('-verbose' => $self->verbose,
451 '-format' => $ioformat,
452 '-file' => $tmpfile);
455 if ($self->retrieval_type =~ /io_string/i ) {
456 my $resp = $self->_request($request);
457 my $content = $resp->content_ref;
458 $self->debug( "content is $$content\n");
459 if (!$resp->is_success() || length($$content) == 0) {
460 $self->throw("WebDBSeqI Error - check query sequences!\n");
462 ($rformat,$ioformat) = $self->request_format();
463 $self->postprocess_data('type'=> 'string',
464 'location' => $content);
465 $self->debug( "str is $$content\n");
466 return Bio
::SeqIO
->new('-verbose' => $self->verbose,
467 '-format' => $ioformat,
468 '-fh' => new IO
::String
($$content));
471 # if we got here, we don't know how to handle the retrieval type
472 $self->throw("retrieval type " .
473 $self->retrieval_type .
477 =head2 get_Stream_by_acc
479 Title : get_Stream_by_acc
480 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
481 Function: Gets a series of Seq objects by GenBank accession numbers
482 Returns : a Bio::SeqIO stream object
483 Args : an arrayref of accession numbers for
484 the desired sequence entries
485 Note : For LANL DB, alternative to LANL seqids
489 sub get_Stream_by_acc
{
490 my ($self, $ids ) = @_;
491 return $self->get_seq_stream('-uids' => [$ids], '-mode' => 'acc');
494 =head2 get_Stream_by_query
496 Title : get_Stream_by_query
497 Usage : $stream = $db->get_Stream_by_query($query);
498 Function: Gets a series of Seq objects by way of a query string or oject
499 Returns : a Bio::SeqIO stream object
500 Args : $query : Currently, only a Bio::DB::Query::HIVQuery object.
501 It's a good idea to create the query object first and interrogate
502 it for the entry count before you fetch a potentially large stream.
506 sub get_Stream_by_query
{
507 my ($self, $query ) = @_;
508 my $stream = $self->get_seq_stream('-query' => $query, '-mode'=>'query');
509 return new Bio
::DB
::HIV
::HIVAnnotProcessor
( -hiv_query
=>$query, -source_stream
=>$stream );
513 my ($self, $request,$tmpfile) = @_;
516 if( defined $tmpfile && $tmpfile ne '' ) {
517 $resp = $self->ua->request($request, $tmpfile);
519 $resp = $self->ua->request($request);
522 if( $resp->is_error ) {
523 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
533 Usage : $obj->lanl_base($newval)
534 Function: get/set the base url of the LANL HIV database
536 Returns : value of lanl_base (a scalar)
537 Args : on set, new value (a scalar or undef, optional)
544 return $self->{'lanl_base'} = shift if @_;
545 return $self->{'lanl_base'};
551 Usage : $obj->map_db($newval)
552 Function: get/set the cgi filename for map_db ("Database Map")
554 Returns : value of map_db (a scalar)
555 Args : on set, new value (a scalar or undef, optional)
562 return $self->{'map_db'} = shift if @_;
563 return $self->{'map_db'};
566 =head2 make_search_if
568 Title : make_search_if
569 Usage : $obj->make_search_if($newval)
570 Function: get/set the cgi filename for make_search_if ("Make Search Interface")
572 Returns : value of make_search_if (a scalar)
573 Args : on set, new value (a scalar or undef, optional)
580 return $self->{'make_search_if'} = shift if @_;
581 return $self->{'make_search_if'};
587 Usage : $obj->search_($newval)
588 Function: get/set the cgi filename for the search query page
591 Returns : value of search_ (a scalar)
592 Args : on set, new value (a scalar or undef, optional)
599 return $self->{'search_'} = shift if @_;
600 return $self->{'search_'};
607 Function: return the full map_db uri ("Database Map")
609 Returns : scalar string
616 return $self->url_base_address."/".$self->map_db;
620 =head2 _make_search_if_uri
622 Title : _make_search_if_uri
624 Function: return the full make_search_if uri ("Make Search Interface")
626 Returns : scalar string
631 sub _make_search_if_uri
{
633 return $self->url_base_address."/".$self->make_search_if;
640 Function: return the full search cgi uri ("Search Database")
642 Returns : scalar string
649 return $self->url_base_address."/".$self->search_;
655 Usage : $obj->_session_id($newval)
656 Function: Contains HIV db session id (initialized in _do_lanl_request)
658 Returns : value of _session_id (a scalar)
659 Args : on set, new value (a scalar or undef, optional)
666 return $self->{'_session_id'} = shift if @_;
667 return $self->{'_session_id'};
674 Function: Throws an exception for unsupported option or parameter
684 $self->throw(-class=>"Bio::HIVSorry::Exception",
685 -text
=>"Sorry, option/parameter \"$parm\" not (yet) supported. See manpage to complain.",