[bug 2637]
[bioperl-live.git] / Bio / DB / HIV.pm
blobbbaa7a1dccd3ec5872bcccfee2aa1afbfdc59f4b
1 # $Id: HIV.pm 232 2008-12-11 14:51:51Z maj $
3 # BioPerl module for Bio::DB::HIV
5 # Cared for by Mark A. Jensen <maj@fortinbras.us>
7 # Copyright Mark A. Jensen
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::DB::HIV - Database object interface to the Los Alamos HIV Sequence Database
17 =head1 SYNOPSIS
19 $db = new Bio::DB::HIV;
21 $seq = $db->get_Seq_by_id('94284'); # LANL sequence id
22 $seq = $db->get_Seq_by_acc('EF432710'); # GenBank accession
24 $q = new Bio::DB::Query::HIVQuery( " (C D)[subtype] SI[phenotype] (symptomatic AIDS)[patient_health] " );
26 $seqio = $db->get_Stream_by_query($q);
27 $seq = $seqio->next_seq();
28 ($seq->annotation->get_Annotations('Virus'))[0]->{subtype} # returns 'D'
29 ($seq->annotation->get_Annotations('Patient'))[0]->{patient_health} # returns 'AIDS'
30 ($seq->annotation->get_Annotations('accession'))[0]->{value} # returns 'K03454'
32 =head1 DESCRIPTION
34 Bio::DB::HIV, along with L<Bio::DB::Query::HIVQuery>, provides an
35 interface for obtaining annotated HIV and SIV sequences from the Los
36 Alamos National Laboratory (LANL) HIV Sequence Database (
37 L<http://www.hiv.lanl.gov/content/sequence/HIV/mainpage.html>
38 ). Unannotated sequences can be retrieved directly from the database
39 object, using either LANL ids or GenBank accessions. Annotations are
40 obtained via a query object, and are attached to the correct C<Bio::Seq>
41 objects when the query is handled by C<Bio::DB::HIV::get_Seq_by_query>
42 or C<Bio::DB::HIV::get_Stream_by_query>.
44 =head1 FEEDBACK
46 =head2 Mailing Lists
48 User feedback is an integral part of the evolution of this and other
49 Bioperl modules. Send your comments and suggestions preferably to
50 the Bioperl mailing list. Your participation is much appreciated.
52 bioperl-l@bioperl.org - General discussion
53 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 =head2 Reporting Bugs
57 Report bugs to the Bioperl bug tracking system to help us keep track
58 of the bugs and their resolution. Bug reports can be submitted via
59 the web:
61 http://bugzilla.open-bio.org/
63 =head1 AUTHOR - Mark A. Jensen
65 Email maj@fortinbras.us
67 =head1 CONTRIBUTORS
69 =head1 APPENDIX
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
74 =cut
76 # Let the code begin...
79 package Bio::DB::HIV;
80 use strict;
81 use vars qw( $LANL_BASE $LANL_MAP_DB $LANL_MAKE_SEARCH_IF $LANL_SEARCH );
83 # Object preamble - inherits from Bio::DB::WebDBSeqI
85 use Bio::Root::Root;
86 use HTTP::Request::Common;
87 use Bio::DB::HIV::HIVAnnotProcessor;
89 use base qw(Bio::DB::WebDBSeqI);
92 BEGIN {
93 $LANL_BASE = "http://www.hiv.lanl.gov/components/sequence/HIV/advanced_search";
94 $LANL_MAP_DB = "map_db.comp";
95 $LANL_MAKE_SEARCH_IF = "make_search_if.comp";
96 $LANL_SEARCH = "search.comp";
97 @Bio::ResponseProblem::Exception::ISA = qw( Bio::Root::Exception );
98 @Bio::HIVSorry::Exception::ISA = qw ( Bio::Root::Exception );
99 @Bio::WebError::Exception::ISA = qw( Bio::Root::Exception );
102 =head1 Constructor
104 =head2 new
106 Title : new
107 Usage : my $obj = new Bio::DB::HIV();
108 Function: Builds a new Bio::DB::HIV object
109 Returns : an instance of Bio::DB::HIV
110 Args :
112 =cut
114 sub new {
115 my($class,@args) = @_;
117 my $self = $class->SUPER::new(@args);
118 my ($lanl_base, $lanl_map_db, $lanl_make_search_if, $lanl_search) =
119 $self->_rearrange([qw(
120 LANL_BASE
121 LANL_MAP_DB
122 LANL_MAKE_SEARCH_IF
123 LANL_SEARCH
124 )], @args);
126 $lanl_base && $self->lanl_base($lanl_base);
127 $lanl_map_db && $self->map_db($lanl_map_db);
128 $lanl_make_search_if && $self->make_search_if($lanl_make_search_if);
129 $lanl_search && $self->search_($lanl_search);
130 # defaults
131 $self->lanl_base || $self->lanl_base($LANL_BASE);
132 $self->map_db || $self->map_db($LANL_MAP_DB);
133 $self->make_search_if || $self->make_search_if($LANL_MAKE_SEARCH_IF);
134 $self->search_ || $self->search_($LANL_SEARCH);
135 $self->url_base_address || $self->url_base_address($self->lanl_base);
137 $self->request_format("fasta");
139 return $self;
142 =head1 WebDBSeqI compliance
144 =head2 get_request
146 Title : get_request
147 Usage : my $url = $self->get_request
148 Function: returns a HTTP::Request object
149 Returns :
150 Args : %qualifiers = a hash of qualifiers with keys in
151 (-ids, -format, -mode, -query)
152 Note : Several layers of requests are performed to get to the sequence;
153 see Bio::DB::Query::HIVQuery.
155 =cut
157 sub get_request {
158 my $self = shift;
159 my %quals = @_;
160 my ($resp);
161 my (@ids, $mode, @interface, @query_parms, $query);
163 # html parsing regexps
164 my $tags_re = qr{(?:\s*<[^>]+>\s*)};
165 my $session_id_re = qr{<input.*name="id".*value="([0-9a-f]+)"}m;
166 my $search_form_re = qr{<form[^>]*action=".*/search.comp"};
167 my $seqs_found_re = qr{Displaying$tags_re*(?:\s*[0-9-]*\s*)*$tags_re*of$tags_re*\s*([0-9]+)$tags_re*sequences found};
168 my $no_seqs_found_re = qr{Sorry.*no sequences found};
170 # handle "qualifiers"
171 foreach (keys %quals) {
172 m/mode/ && do {
173 $mode = $quals{$_};
174 next;
176 m/uids/ && do {
177 $self->throw(-class=>"Bio::Root::BadParameter",
178 -text=>"Arrayref required for qualifier \"$_\"",
179 -value=>$quals{$_}) unless ref($quals{$_}) eq 'ARRAY';
180 @ids = @{$quals{$_}};
181 next;
183 m/query/ && do {
184 $self->throw(-class=>"Bio::Root::BadParameter",
185 -text=>"Bio::DB::Query::HIVQuery required for qualifier \"$_\"",
186 -value=>$quals{$_}) unless $quals{$_}->isa("Bio::DB::Query::HIVQuery");
187 $query = $quals{$_};
188 next;
190 do {
191 1; #else stub
194 # what kind of request?
195 for my $m ($mode) {
196 ($m =~ m/single/) && do {
197 @interface = (
198 'SequenceEntry' => 'SE_Sequence',
199 'SequenceEntry' => 'SE_id',
200 'action' => 'Search Interface'
202 @query_parms = map { ('SequenceEntry.SE_id' => $_ ) } @ids;
203 push @query_parms, (
204 'SequenceEntry.SE_Sequence'=>'Any',
205 'order' => 'SequenceEntry.SE_id',
206 'sort_dir' => 'ASC',
207 'action' => 'Search'
210 ($mode =~ m/acc/) && do {
211 @interface = (
212 'SequenceEntry' => 'SE_Sequence',
213 'SequenceEntry' => 'SE_id',
214 'SequenceAccessions' => 'SA_GenBankAccession',
215 'SequenceAccessions' => 'SA_SE_id',
216 'action' => 'Search Interface'
218 @query_parms = map {('SequenceAccessions.SA_GenBankAccession' => $_)} @ids;
219 push @query_parms, (
220 'SequenceEntry.SE_Sequence' => 'Any',
221 'order' => 'SequenceAccessions.SA_GenBankAccession',
222 'sort_dir' => 'ASC',
223 'action' => 'Search'
226 ($mode =~ m/gi/) && do {
227 $self->_sorry("-mode=>gi");
229 ($mode =~ m/version/) && do {
230 $self->_sorry("-mode=>version");
232 ($mode =~ m/query/) && do {
233 $self->throw(-class=>"Bio::Root::BadParameter",
234 -text=>"Query ".($query->{'_RUN_LEVEL'} ? "has been run only at run level ".$query->{'_RUN_LEVEL'} : "has not been run").", run at level 2 with _do_query(2)",
235 -value=>$query->{'_RUN_LEVEL'}) unless $query->{'_RUN_LEVEL'} == 2;
236 @interface = (
237 'SequenceEntry' => 'SE_Sequence',
238 'SequenceEntry' => 'SE_id',
239 'action' => 'Search Interface'
241 @query_parms = map { ( "SequenceEntry.SE_id" => $_ ) } $query->ids;
242 push @query_parms, (
243 'SequenceEntry.SE_Sequence' => 'Any',
244 'order' => 'SequenceEntry.SE_id',
245 'sort_dir' => 'ASC',
246 'action' => 'Search'
249 do {
250 1; # else stub
253 # web work
254 eval { # capture web errors; throw below...
255 # negotiate a session with lanl db
256 if (!$self->_session_id) {
257 $resp = $self->ua->get($self->_map_db_uri);
258 $resp->is_success or die "Connect failed";
259 # get the session id
260 if (!$self->_session_id) {
261 ($self->{'_session_id'}) = ($resp->content =~ /$session_id_re/);
262 $self->_session_id or die "Session not established";
266 # establish correct "interface" for this session id
267 $resp = $self->ua->post($self->_make_search_if_uri, [@interface, id=>$self->_session_id]);
268 $resp->is_success or die "Interface request failed (1)";
269 $resp->content =~ /$search_form_re/ or die "Interface request failed (2)";
271 # interface successful, do the "pre-search"
272 $resp = $self->ua->post($self->_search_uri, [@query_parms, 'id' => $self->_session_id]);
273 $resp->is_success or die "Search post failed";
274 ($resp->content !~ /$no_seqs_found_re/) or die "No sequences found";
275 ($resp->content =~ /$seqs_found_re/) or die "Unparsed failure";
277 $self->throw(-class=>'Bio::WebError::Exception',
278 -text=>$@,
279 -value=>"") if $@;
281 # "pre-search" successful, return request
283 return POST $self->_search_uri,
284 ['action Download.x' => 1,
285 'action Download.y'=>1,
286 'id'=>$self->_session_id
292 =head2 postprocess_data
294 Title : postprocess_data
295 Usage : $self->postprocess_data ( 'type' => 'string',
296 'location' => \$datastr);
297 Function: process downloaded data before loading into a Bio::SeqIO
298 Returns : void
299 Args : hash with two keys - 'type' can be 'string' or 'file'
300 - 'location' either file location or string
301 reference containing data
303 =cut
305 sub postprocess_data {
306 # parse tab-separated value content from LANL db
307 my ( $self, %args) = @_;
308 my ($type, $loc) = ($args{type}, $args{location});
309 my (@data, @cols, %rec, $idkey, @flines);
310 $self->throw(-class=>'Bio::Root::BadParameter',
311 -text=>"Argument hash requires values for keys \"type\" and \"location\"",
312 -value=>\%args) unless ($type && $loc);
313 for ($type) {
314 m/string/ && do {
315 @data = split(/\n|\r/, ${$loc});
316 last;
318 m/file/ && do {
319 local $/;
320 undef $/;
321 open (my $F, "<", $loc) or
322 $self->throw(
323 -class=>'Bio::Root::FileOpenException',
324 -text=>"Error opening tempfile \"$loc\" for reading",
325 -value=>$loc
327 @data = split( /\n|\r/, <F>);
328 close($F);
329 last;
331 do {
332 1; # else stub
335 $self->throw(-class=>'Bio::Root::BadParameter',
336 -text=>'No data found in repsonse',
337 -value=>%args) unless (@data);
338 shift @data; # number-returned line
339 @cols = split( /\t/, shift @data);
341 # if Accession column is present, get_Stream_by_acc was called
342 # otherwise, return lanl ids
343 ($idkey) = grep /SE.id/, @cols unless ($idkey) = grep /Accession/, @cols;
344 $self->throw(-class=>"Bio::ResponseProblem::Exception",
345 -text=>"Trouble with column headers in LANL response",
346 -value=>\@cols) unless $idkey;
348 foreach (@data) {
349 chop;
350 @rec{@cols} = split /\t/;
351 push @flines, ">$rec{$idkey}\n".$rec{'Sequence'}."\n";
353 for ($type) {
354 m/string/ && do {
355 ${$loc} = join("", @flines);
356 last;
358 m/file/ && do {
359 open(F, ">", $loc) or $self->throw(-class=>'Bio::Root::FileOpenException',
360 -text=>'Error opening tempfile \"$loc\" for writing',
361 -value=>$loc);
362 print F join("", @flines);
363 close(F);
364 last;
366 do {
367 1; #else stub
370 return;
373 =head1 WebDBSeqI overrides
375 =head2 get_seq_stream
377 Title : get_seq_stream
378 Usage : my $seqio = $self->get_seq_stream(%qualifiers)
379 Function: builds a url and queries a web db
380 Returns : a Bio::SeqIO stream capable of producing sequence
381 Args : %qualifiers = a hash qualifiers that the implementing class
382 will process to make a url suitable for web querying
383 Note : Some tightening up of the baseclass version
385 =cut
387 sub get_seq_stream {
388 my ($self, %qualifiers) = @_;
389 my ($rformat, $ioformat) = $self->request_format();
391 my ($key) = grep /format$/, keys %qualifiers;
392 $qualifiers{'-format'} = ($key ? $qualifiers{$key} : $rformat);
393 ($rformat, $ioformat) = $self->request_format($qualifiers{'format'});
395 # web work is here/maj
396 my $request = $self->get_request(%qualifiers);
398 # authorization is here/maj
399 $request->proxy_authorization_basic($self->authentication)
400 if ( $self->authentication);
401 $self->debug("request is ". $request->as_string(). "\n");
403 # workaround for MSWin systems (no forking available/maj)
404 $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O =~ /^MSWin/;
406 if ($self->retrieval_type =~ /pipeline/) {
407 # Try to create a stream using POSIX fork-and-pipe facility.
408 # this is a *big* win when fetching thousands of sequences from
409 # a web database because we can return the first entry while
410 # transmission is still in progress.
411 # Also, no need to keep sequence in memory or in a temporary file.
412 # If this fails (Windows, MacOS 9), we fall back to non-pipelined access.
414 # fork and pipe: _stream_request()=><STREAM>
415 my ($result,$stream) = $self->_open_pipe();
417 if (defined $result) {
418 $DB::fork_TTY = File::Spec->devnull; # prevents complaints from debugge
419 if (!$result) { # in child process
420 $self->_stream_request($request,$stream);
421 POSIX::_exit(0); #prevent END blocks from executing in this forked child
423 else {
424 return Bio::SeqIO->new('-verbose' => $self->verbose,
425 '-format' => $ioformat,
426 '-fh' => $stream);
429 else {
430 $self->retrieval_type('io_string');
434 if ($self->retrieval_type =~ /temp/i) {
435 my $dir = $self->io->tempdir( CLEANUP => 1);
436 my ( $fh, $tmpfile) = $self->io()->tempfile( DIR => $dir );
437 close $fh;
438 my $resp = $self->_request($request, $tmpfile);
439 if( ! -e $tmpfile || -z $tmpfile || ! $resp->is_success() ) {
440 $self->throw("WebDBSeqI Error - check query sequences!\n");
442 $self->postprocess_data('type' => 'file','location' => $tmpfile);
443 # this may get reset when requesting batch mode
444 ($rformat,$ioformat) = $self->request_format();
445 if( $self->verbose > 0 ) {
446 open(my $ERR, "<", $tmpfile);
447 while(<$ERR>) { $self->debug($_);}
450 return Bio::SeqIO->new('-verbose' => $self->verbose,
451 '-format' => $ioformat,
452 '-file' => $tmpfile);
455 if ($self->retrieval_type =~ /io_string/i ) {
456 my $resp = $self->_request($request);
457 my $content = $resp->content_ref;
458 $self->debug( "content is $$content\n");
459 if (!$resp->is_success() || length($$content) == 0) {
460 $self->throw("WebDBSeqI Error - check query sequences!\n");
462 ($rformat,$ioformat) = $self->request_format();
463 $self->postprocess_data('type'=> 'string',
464 'location' => $content);
465 $self->debug( "str is $$content\n");
466 return Bio::SeqIO->new('-verbose' => $self->verbose,
467 '-format' => $ioformat,
468 '-fh' => new IO::String($$content));
471 # if we got here, we don't know how to handle the retrieval type
472 $self->throw("retrieval type " .
473 $self->retrieval_type .
474 " unsupported\n");
477 =head2 get_Stream_by_acc
479 Title : get_Stream_by_acc
480 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
481 Function: Gets a series of Seq objects by GenBank accession numbers
482 Returns : a Bio::SeqIO stream object
483 Args : an arrayref of accession numbers for
484 the desired sequence entries
485 Note : For LANL DB, alternative to LANL seqids
487 =cut
489 sub get_Stream_by_acc {
490 my ($self, $ids ) = @_;
491 return $self->get_seq_stream('-uids' => [$ids], '-mode' => 'acc');
494 =head2 get_Stream_by_query
496 Title : get_Stream_by_query
497 Usage : $stream = $db->get_Stream_by_query($query);
498 Function: Gets a series of Seq objects by way of a query string or oject
499 Returns : a Bio::SeqIO stream object
500 Args : $query : Currently, only a Bio::DB::Query::HIVQuery object.
501 It's a good idea to create the query object first and interrogate
502 it for the entry count before you fetch a potentially large stream.
504 =cut
506 sub get_Stream_by_query {
507 my ($self, $query ) = @_;
508 my $stream = $self->get_seq_stream('-query' => $query, '-mode'=>'query');
509 return new Bio::DB::HIV::HIVAnnotProcessor( -hiv_query=>$query, -source_stream=>$stream );
512 sub _request {
513 my ($self, $request,$tmpfile) = @_;
514 my ($resp);
516 if( defined $tmpfile && $tmpfile ne '' ) {
517 $resp = $self->ua->request($request, $tmpfile);
518 } else {
519 $resp = $self->ua->request($request);
522 if( $resp->is_error ) {
523 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
525 return $resp;
528 =head1 Internals
530 =head2 lanl_base
532 Title : lanl_base
533 Usage : $obj->lanl_base($newval)
534 Function: get/set the base url of the LANL HIV database
535 Example :
536 Returns : value of lanl_base (a scalar)
537 Args : on set, new value (a scalar or undef, optional)
539 =cut
541 sub lanl_base{
542 my $self = shift;
544 return $self->{'lanl_base'} = shift if @_;
545 return $self->{'lanl_base'};
548 =head2 map_db
550 Title : map_db
551 Usage : $obj->map_db($newval)
552 Function: get/set the cgi filename for map_db ("Database Map")
553 Example :
554 Returns : value of map_db (a scalar)
555 Args : on set, new value (a scalar or undef, optional)
557 =cut
559 sub map_db{
560 my $self = shift;
562 return $self->{'map_db'} = shift if @_;
563 return $self->{'map_db'};
566 =head2 make_search_if
568 Title : make_search_if
569 Usage : $obj->make_search_if($newval)
570 Function: get/set the cgi filename for make_search_if ("Make Search Interface")
571 Example :
572 Returns : value of make_search_if (a scalar)
573 Args : on set, new value (a scalar or undef, optional)
575 =cut
577 sub make_search_if{
578 my $self = shift;
580 return $self->{'make_search_if'} = shift if @_;
581 return $self->{'make_search_if'};
584 =head2 search_
586 Title : search_
587 Usage : $obj->search_($newval)
588 Function: get/set the cgi filename for the search query page
589 ("Search Database")
590 Example :
591 Returns : value of search_ (a scalar)
592 Args : on set, new value (a scalar or undef, optional)
594 =cut
596 sub search_{
597 my $self = shift;
599 return $self->{'search_'} = shift if @_;
600 return $self->{'search_'};
603 =head2 _map_db_uri
605 Title : _map_db_uri
606 Usage :
607 Function: return the full map_db uri ("Database Map")
608 Example :
609 Returns : scalar string
610 Args : none
612 =cut
614 sub _map_db_uri{
615 my $self = shift;
616 return $self->url_base_address."/".$self->map_db;
620 =head2 _make_search_if_uri
622 Title : _make_search_if_uri
623 Usage :
624 Function: return the full make_search_if uri ("Make Search Interface")
625 Example :
626 Returns : scalar string
627 Args : none
629 =cut
631 sub _make_search_if_uri{
632 my $self = shift;
633 return $self->url_base_address."/".$self->make_search_if;
636 =head2 _search_uri
638 Title : _search_uri
639 Usage :
640 Function: return the full search cgi uri ("Search Database")
641 Example :
642 Returns : scalar string
643 Args : none
645 =cut
647 sub _search_uri{
648 my $self = shift;
649 return $self->url_base_address."/".$self->search_;
652 =head2 _session_id
654 Title : _session_id
655 Usage : $obj->_session_id($newval)
656 Function: Contains HIV db session id (initialized in _do_lanl_request)
657 Example :
658 Returns : value of _session_id (a scalar)
659 Args : on set, new value (a scalar or undef, optional)
661 =cut
663 sub _session_id{
664 my $self = shift;
666 return $self->{'_session_id'} = shift if @_;
667 return $self->{'_session_id'};
670 =head2 Dude, sorry
672 Title : _sorry
673 Usage : $hiv->_sorry
674 Function: Throws an exception for unsupported option or parameter
675 Example :
676 Returns :
677 Args : scalar string
679 =cut
681 sub _sorry{
682 my $self = shift;
683 my $parm = shift;
684 $self->throw(-class=>"Bio::HIVSorry::Exception",
685 -text=>"Sorry, option/parameter \"$parm\" not (yet) supported. See manpage to complain.",
686 -value=>$parm);
687 return;