Set the EUtils URL in NCBIHelper, inherit in entrez.pm and GenBank.pm, t/RemoteDB...
[bioperl-live.git] / Bio / DB / NCBIHelper.pm
blob9d3b261d83212df1a0537d386acbf0634fc9ff47
2 # BioPerl module for Bio::DB::NCBIHelper
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # Interfaces with new WebDBSeqI interface
16 =head1 NAME
18 Bio::DB::NCBIHelper - A collection of routines useful for queries to
19 NCBI databases.
21 =head1 SYNOPSIS
23 # Do not use this module directly.
25 # get a Bio::DB::NCBIHelper object somehow
26 my $seqio = $db->get_Stream_by_acc(['J00522']);
27 foreach my $seq ( $seqio->next_seq ) {
28 # process seq
31 =head1 DESCRIPTION
33 Provides a single place to setup some common methods for querying NCBI
34 web databases. This module just centralizes the methods for
35 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
36 common HTML stripping done in L<postprocess_data>().
38 The base NCBI query URL used is:
39 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the
46 evolution of this and other Bioperl modules. Send
47 your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation
49 is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to
68 help us keep track the bugs and their resolution.
69 Bug reports can be submitted via the web.
71 https://redmine.open-bio.org/projects/bioperl/
73 =head1 AUTHOR - Jason Stajich
75 Email jason@bioperl.org
77 =head1 APPENDIX
79 The rest of the documentation details each of the
80 object methods. Internal methods are usually
81 preceded with a _
83 =cut
85 # Let the code begin...
87 package Bio::DB::NCBIHelper;
88 use strict;
89 use vars qw($HOSTBASE %CGILOCATION %FORMATMAP $DEFAULTFORMAT $MAX_ENTRIES);
91 use Bio::DB::Query::GenBank;
92 use HTTP::Request::Common;
93 use URI;
94 use Bio::Root::IO;
95 use Bio::DB::RefSeq;
96 use URI::Escape qw(uri_unescape);
98 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
100 BEGIN {
101 $MAX_ENTRIES = 19000;
102 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
103 %CGILOCATION = (
104 'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
105 'query' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
106 'single' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
107 'version' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
108 'gi' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
109 'webenv' => [ 'get' => '/entrez/eutils/efetch.fcgi' ]
112 %FORMATMAP = (
113 'gb' => 'genbank',
114 'gp' => 'genbank',
115 'fasta' => 'fasta',
116 'asn.1' => 'entrezgene',
117 'gbwithparts' => 'genbank',
119 $DEFAULTFORMAT = 'gb';
123 # the new way to make modules a little more lightweight
125 sub new {
126 my ($class, @args ) = @_;
127 my $self = $class->SUPER::new(@args);
128 my ($seq_start,$seq_stop,$no_redirect, $redirect, $complexity,$strand) =
129 $self->_rearrange([qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND)],
130 @args);
131 $seq_start && $self->seq_start($seq_start);
132 $seq_stop && $self->seq_stop($seq_stop);
133 $no_redirect && $self->no_redirect($no_redirect);
134 $redirect && $self->redirect_refseq($redirect);
135 $strand && $self->strand($strand);
136 # adjust statement to accept zero value
137 defined $complexity && ($complexity >=0 && $complexity <=4)
138 && $self->complexity($complexity);
139 return $self;
143 =head2 get_params
145 Title : get_params
146 Usage : my %params = $self->get_params($mode)
147 Function: Returns key,value pairs to be passed to NCBI database
148 for either 'batch' or 'single' sequence retrieval method
149 Returns : a key,value pair hash
150 Args : 'single' or 'batch' mode for retrieval
152 =cut
154 sub get_params {
155 my ($self, $mode) = @_;
156 $self->throw("subclass did not implement get_params");
159 =head2 default_format
161 Title : default_format
162 Usage : my $format = $self->default_format
163 Function: Returns default sequence format for this module
164 Returns : string
165 Args : none
167 =cut
169 sub default_format {
170 return $DEFAULTFORMAT;
173 =head2 get_request
175 Title : get_request
176 Usage : my $url = $self->get_request
177 Function: HTTP::Request
178 Returns :
179 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
181 =cut
183 sub get_request {
184 my ($self, @qualifiers) = @_;
185 my ($mode, $uids, $format, $query, $seq_start, $seq_stop, $strand, $complexity) =
186 $self->_rearrange([qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
187 @qualifiers);
188 $mode = lc $mode;
189 ($format) = $self->request_format() unless ( defined $format);
190 if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
191 my %params = $self->get_params($mode);
192 if( ! %params ) {
193 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
195 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
196 unless( $mode eq 'webenv' || defined $uids || defined $query) {
197 $self->throw("Must specify a query or list of uids to fetch");
199 if ($query && $query->can('cookie')) {
200 @params{'WebEnv','query_key'} = $query->cookie;
201 $params{'db'} = $query->db;
203 elsif ($query) {
204 $params{'id'} = join ',',$query->ids;
206 # for batch retrieval, non-query style
207 elsif ($mode eq 'webenv' && $self->can('cookie')) {
208 @params{'WebEnv','query_key'} = $self->cookie;
210 elsif ($uids) {
211 if( ref($uids) =~ /array/i ) {
212 $uids = join(",", @$uids);
214 $params{'id'} = $uids;
216 $seq_start && ($params{'seq_start'} = $seq_start);
217 $seq_stop && ($params{'seq_stop'} = $seq_stop);
218 $strand && ($params{'strand'} = $strand);
219 if (defined $complexity && ($seq_start || $seq_stop || $strand)) {
220 $self->warn("Complexity set to $complexity; seq_start and seq_stop may not work!")
221 if ($complexity != 1 && ($seq_start || $seq_stop));
222 $self->warn("Complexity set to 0; expect strange results with strand set to 2")
223 if ($complexity == 0 && $strand == 2 && $format eq 'fasta');
225 defined $complexity && ($params{'complexity'} = $complexity);
226 $params{'rettype'} = $format unless $mode eq 'batch';
227 # for now, 'post' is batch retrieval
228 if ($CGILOCATION{$mode}[0] eq 'post') {
229 my $response = $self->ua->request(POST $url,[%params]);
230 $response->proxy_authorization_basic($self->authentication)
231 if ( $self->authentication);
232 $self->_parse_response($response->content);
233 my ($cookie, $querykey) = $self->cookie;
234 my %qualifiers = ('-mode' => 'webenv',
235 '-seq_start' => $seq_start,
236 '-seq_stop' => $seq_stop,
237 '-strand' => $strand,
238 '-complexity' => $complexity,
239 '-format' => $format);
240 return $self->get_request(%qualifiers);
241 } else {
242 $url->query_form(%params);
243 return GET $url;
247 =head2 get_Stream_by_batch
249 Title : get_Stream_by_batch
250 Usage : $seq = $db->get_Stream_by_batch($ref);
251 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
252 at a time. For large numbers of sequences, this is far superior
253 than get_Stream_by_[id/acc]().
254 Example :
255 Returns : a Bio::SeqIO stream object
256 Args : $ref : either an array reference, a filename, or a filehandle
257 from which to get the list of unique ids/accession numbers.
259 NOTE: deprecated API. Use get_Stream_by_id() instead.
261 =cut
263 *get_Stream_by_batch = sub {
264 my $self = shift;
265 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
266 $self->get_Stream_by_id(@_)
269 =head2 get_Stream_by_query
271 Title : get_Stream_by_query
272 Usage : $seq = $db->get_Stream_by_query($query);
273 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
274 at a time. For large numbers of sequences, this is far superior
275 than get_Stream_by_[id/acc]().
276 Example :
277 Returns : a Bio::SeqIO stream object
278 Args : $query : An Entrez query string or a
279 Bio::DB::Query::GenBank object. It is suggested that you
280 create a Bio::DB::Query::GenBank object and get the entry
281 count before you fetch a potentially large stream.
283 =cut
285 sub get_Stream_by_query {
286 my ($self, $query) = @_;
287 unless (ref $query && $query->can('query')) {
288 $query = Bio::DB::Query::GenBank->new($query);
290 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
293 =head2 postprocess_data
295 Title : postprocess_data
296 Usage : $self->postprocess_data ( 'type' => 'string',
297 'location' => \$datastr);
298 Function: process downloaded data before loading into a Bio::SeqIO
299 Returns : void
300 Args : hash with two keys - 'type' can be 'string' or 'file'
301 - 'location' either file location or string
302 reference containing data
304 =cut
306 # the default method, works for genbank/genpept, other classes should
307 # override it with their own method.
309 sub postprocess_data {
310 # retain this in case postprocessing is needed at a future date
314 =head2 request_format
316 Title : request_format
317 Usage : my ($req_format, $ioformat) = $self->request_format;
318 $self->request_format("genbank");
319 $self->request_format("fasta");
320 Function: Get/Set sequence format retrieval. The get-form will normally not
321 be used outside of this and derived modules.
322 Returns : Array of two strings, the first representing the format for
323 retrieval, and the second specifying the corresponding SeqIO format.
324 Args : $format = sequence format
326 =cut
328 sub request_format {
329 my ($self, $value) = @_;
330 if( defined $value ) {
331 $value = lc $value;
332 if( defined $FORMATMAP{$value} ) {
333 $self->{'_format'} = [ $value, $FORMATMAP{$value}];
334 } else {
335 # Try to fall back to a default. Alternatively, we could throw
336 # an exception
337 $self->{'_format'} = [ $value, $value ];
340 return @{$self->{'_format'}};
343 =head2 redirect_refseq
345 Title : redirect_refseq
346 Usage : $db->redirect_refseq(1)
347 Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
348 Returns : Boolean value
349 Args : Boolean value (optional)
350 Throws : 'unparseable output exception'
351 Note : This replaces 'no_redirect' as a more straightforward flag to
352 redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
353 instead of retrievign the NCBI records
355 =cut
357 sub redirect_refseq {
358 my $self = shift;
359 return $self->{'_redirect_refseq'} = shift if @_;
360 return $self->{'_redirect_refseq'};
363 =head2 complexity
365 Title : complexity
366 Usage : $db->complexity(3)
367 Function: get/set complexity value
368 Returns : value from 0-4 indicating level of complexity
369 Args : value from 0-4 (optional); if unset server assumes 1
370 Throws : if arg is not an integer or falls outside of noted range above
371 Note : From efetch docs:
373 Complexity regulates the display:
375 * 0 - get the whole blob
376 * 1 - get the bioseq for gi of interest (default in Entrez)
377 * 2 - get the minimal bioseq-set containing the gi of interest
378 * 3 - get the minimal nuc-prot containing the gi of interest
379 * 4 - get the minimal pub-set containing the gi of interest
381 =cut
383 sub complexity {
384 my ($self, $comp) = @_;
385 if (defined $comp) {
386 $self->throw("Complexity value must be integer between 0 and 4") if
387 $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
388 $self->{'_complexity'} = $comp;
390 return $self->{'_complexity'};
393 =head2 strand
395 Title : strand
396 Usage : $db->strand(1)
397 Function: get/set strand value
398 Returns : strand value if set
399 Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
400 Throws : if arg is not an integer or is not 1 or 2
401 Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant.
402 We should probably add in some functionality to convert over in the future.
404 =cut
406 sub strand {
407 my ($self, $str) = @_;
408 if ($str) {
409 $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set") if
410 $str !~ /^\d+$/ || $str < 1 || $str > 2;
411 $self->{'_strand'} = $str;
413 return $self->{'_strand'};
416 =head2 seq_start
418 Title : seq_start
419 Usage : $db->seq_start(123)
420 Function: get/set sequence start location
421 Returns : sequence start value if set
422 Args : integer; if unset server assumes 1
423 Throws : if arg is not an integer
425 =cut
427 sub seq_start {
428 my ($self, $start) = @_;
429 if ($start) {
430 $self->throw("seq_start() must be integer value if set") if
431 $start !~ /^\d+$/;
432 $self->{'_seq_start'} = $start;
434 return $self->{'_seq_start'};
437 =head2 seq_stop
439 Title : seq_stop
440 Usage : $db->seq_stop(456)
441 Function: get/set sequence stop (end) location
442 Returns : sequence stop (end) value if set
443 Args : integer; if unset server assumes 1
444 Throws : if arg is not an integer
446 =cut
448 sub seq_stop {
449 my ($self, $stop) = @_;
450 if ($stop) {
451 $self->throw("seq_stop() must be integer if set") if
452 $stop !~ /^\d+$/;
453 $self->{'_seq_stop'} = $stop;
455 return $self->{'_seq_stop'};
458 =head2 Bio::DB::WebDBSeqI methods
460 Overriding WebDBSeqI method to help newbies to retrieve sequences
462 =head2 get_Stream_by_acc
464 Title : get_Stream_by_acc
465 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
466 Function: Gets a series of Seq objects by accession numbers
467 Returns : a Bio::SeqIO stream object
468 Args : $ref : a reference to an array of accession numbers for
469 the desired sequence entries
470 Note : For GenBank, this just calls the same code for get_Stream_by_id()
472 =cut
474 sub get_Stream_by_acc {
475 my ($self, $ids ) = @_;
476 my $newdb = $self->_check_id($ids);
477 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
478 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
479 } else {
480 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
485 =head2 _check_id
487 Title : _check_id
488 Usage :
489 Function:
490 Returns : A Bio::DB::RefSeq reference or throws
491 Args : $id(s), $string
493 =cut
495 sub _check_id {
496 my ($self, $ids) = @_;
498 # NT contigs can not be retrieved
499 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
500 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
501 if $ids =~ /NT_/;
503 # Asking for a RefSeq from EMBL/GenBank
505 if ($self->redirect_refseq) {
506 if ($ids =~ /N._/) {
507 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
508 " Redirecting the request.\n")
509 if $self->verbose >= 0;
510 return Bio::DB::RefSeq->new();
515 =head2 delay_policy
517 Title : delay_policy
518 Usage : $secs = $self->delay_policy
519 Function: return number of seconds to delay between calls to remote db
520 Returns : number of seconds to delay
521 Args : none
523 NOTE: NCBI requests a delay of 3 seconds between requests. This method
524 implements that policy.
526 =cut
528 sub delay_policy {
529 my $self = shift;
530 return 3;
533 =head2 cookie
535 Title : cookie
536 Usage : ($cookie,$querynum) = $db->cookie
537 Function: return the NCBI query cookie
538 Returns : list of (cookie,querynum)
539 Args : none
541 NOTE: this information is used by Bio::DB::GenBank in
542 conjunction with efetch.
544 =cut
546 # ripped from Bio::DB::Query::GenBank
547 sub cookie {
548 my $self = shift;
549 if (@_) {
550 $self->{'_cookie'} = shift;
551 $self->{'_querynum'} = shift;
553 else {
554 return @{$self}{qw(_cookie _querynum)};
558 =head2 _parse_response
560 Title : _parse_response
561 Usage : $db->_parse_response($content)
562 Function: parse out response for cookie
563 Returns : empty
564 Args : none
565 Throws : 'unparseable output exception'
567 =cut
569 # trimmed-down version of _parse_response from Bio::DB::Query::GenBank
570 sub _parse_response {
571 my $self = shift;
572 my $content = shift;
573 if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
574 $self->warn("Warning(s) from GenBank: $warning\n");
576 if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
577 $self->throw("Error from Genbank: $error");
579 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
580 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
581 $self->cookie(uri_unescape($cookie),$querykey);
584 ########### DEPRECATED!!!! ###########
586 =head2 no_redirect
588 Title : no_redirect
589 Usage : $db->no_redirect($content)
590 Function: Used to indicate that Bio::DB::GenBank instance retrieves
591 possible RefSeqs from EBI instead; default behavior is now to
592 retrieve directly from NCBI
593 Returns : None
594 Args : None
595 Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
597 =cut
599 sub no_redirect {
600 shift->throw(
601 "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n".
602 "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n".
603 "redirect_refseq flag to 1");
608 __END__