2 # BioPerl module for Bio::DB::NCBIHelper
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # Interfaces with new WebDBSeqI interface
18 Bio::DB::NCBIHelper - A collection of routines useful for queries to
23 # Do not use this module directly.
25 # get a Bio::DB::NCBIHelper object somehow
26 my $seqio = $db->get_Stream_by_acc(['J00522']);
27 while ( my $seq = $seqio->next_seq ) {
33 Provides a single place to setup some common methods for querying NCBI
34 web databases. This module just centralizes the methods for
35 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
36 common HTML stripping done in L<postprocess_data>().
38 The base NCBI query URL used is:
39 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
45 User feedback is an integral part of the
46 evolution of this and other Bioperl modules. Send
47 your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 responsive experts will be able to look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
67 Report bugs to the Bioperl bug tracking system to
68 help us keep track of the bugs and their resolution.
69 Bug reports can be submitted via the web.
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Jason Stajich
75 Email jason@bioperl.org
79 The rest of the documentation details each of the
80 object methods. Internal methods are usually
85 # Let the code begin...
87 package Bio
::DB
::NCBIHelper
;
90 use Bio
::DB
::Query
::GenBank
;
91 use HTTP
::Request
::Common
;
95 use URI
::Escape
qw(uri_unescape);
97 use base
qw(Bio::DB::WebDBSeqI Bio::Root::Root);
99 our $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
100 our $MAX_ENTRIES = 19000;
101 our $REQUEST_DELAY = 3;
103 'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
104 'query' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
105 'single' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
106 'version' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
107 'gi' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
108 'webenv' => [ 'get' => '/entrez/eutils/efetch.fcgi' ]
114 'asn.1' => 'entrezgene',
115 'gbwithparts' => 'genbank',
117 our $DEFAULTFORMAT = 'gb';
123 Function: the new way to make modules a little more lightweight
130 my ( $class, @args ) = @_;
131 my $self = $class->SUPER::new
(@args);
132 my ($seq_start, $seq_stop, $no_redirect,
133 $redirect, $complexity, $strand
136 [ qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND) ],
139 $seq_start && $self->seq_start($seq_start);
140 $seq_stop && $self->seq_stop($seq_stop);
141 $no_redirect && $self->no_redirect($no_redirect);
142 $redirect && $self->redirect_refseq($redirect);
143 $strand && $self->strand($strand);
145 # adjust statement to accept zero value
147 && ( $complexity >= 0 && $complexity <= 4 )
148 && $self->complexity($complexity);
156 Usage : my %params = $self->get_params($mode)
157 Function: returns key,value pairs to be passed to NCBI database
158 for either 'batch' or 'single' sequence retrieval method
159 Returns : a key,value pair hash
160 Args : 'single' or 'batch' mode for retrieval
165 my ($self, $mode) = @_;
166 $self->throw("subclass did not implement get_params");
169 =head2 default_format
171 Title : default_format
172 Usage : my $format = $self->default_format
173 Function: returns default sequence format for this module
180 return $DEFAULTFORMAT;
186 Usage : my $url = $self->get_request
187 Function: HTTP::Request
189 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
194 my ( $self, @qualifiers ) = @_;
195 my ( $mode, $uids, $format, $query, $seq_start, $seq_stop, $strand,
198 [qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
201 ($format) = $self->request_format() unless ( defined $format );
202 if ( !defined $mode || $mode eq '' ) { $mode = 'single'; }
203 my %params = $self->get_params($mode);
206 "must specify a valid retrieval mode 'single' or 'batch' not '$mode'"
209 my $url = URI
->new( $HOSTBASE . $CGILOCATION{$mode}[1] );
210 unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
211 $self->throw("Must specify a query or list of uids to fetch");
213 if ( $query && $query->can('cookie') ) {
214 @params{ 'WebEnv', 'query_key' } = $query->cookie;
215 $params{'db'} = $query->db;
218 $params{'id'} = join ',', $query->ids;
221 # for batch retrieval, non-query style
222 elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
223 @params{ 'WebEnv', 'query_key' } = $self->cookie;
226 if ( ref($uids) =~ /array/i ) {
227 $uids = join( ",", @
$uids );
229 $params{'id'} = $uids;
231 $seq_start && ( $params{'seq_start'} = $seq_start );
232 $seq_stop && ( $params{'seq_stop'} = $seq_stop );
233 $strand && ( $params{'strand'} = $strand );
234 if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
236 "Complexity set to $complexity; seq_start and seq_stop may not work!"
237 ) if ( $complexity != 1 && ( $seq_start || $seq_stop ) );
239 "Complexity set to 0; expect strange results with strand set to 2"
240 ) if ( $complexity == 0 && $strand == 2 && $format eq 'fasta' );
242 defined $complexity && ( $params{'complexity'} = $complexity );
243 $params{'rettype'} = $format unless $mode eq 'batch';
245 # for now, 'post' is batch retrieval
246 if ( $CGILOCATION{$mode}[0] eq 'post' ) {
247 my $response = $self->ua->request( POST
$url, [%params] );
248 $response->proxy_authorization_basic( $self->authentication )
249 if ( $self->authentication );
250 $self->_parse_response( $response->content );
251 my ( $cookie, $querykey ) = $self->cookie;
254 '-seq_start' => $seq_start,
255 '-seq_stop' => $seq_stop,
256 '-strand' => $strand,
257 '-complexity' => $complexity,
260 return $self->get_request(%qualifiers);
263 $url->query_form(%params);
269 =head2 get_Stream_by_batch
271 Title : get_Stream_by_batch
272 Usage : $seq = $db->get_Stream_by_batch($ref);
273 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
274 at a time. For large numbers of sequences, this is far superior
275 to get_Stream_by_id or get_Stream_by_acc.
277 Returns : a Bio::SeqIO stream object
278 Args : $ref : either an array reference, a filename, or a filehandle
279 from which to get the list of unique ids/accession numbers.
281 NOTE: deprecated API. Use get_Stream_by_id() instead.
285 *get_Stream_by_batch
= sub {
287 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
288 $self->get_Stream_by_id(@_)
291 =head2 get_Stream_by_query
293 Title : get_Stream_by_query
294 Usage : $seq = $db->get_Stream_by_query($query);
295 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
296 at a time. For large numbers of sequences, this is far superior
297 to get_Stream_by_id and get_Stream_by_acc.
299 Returns : a Bio::SeqIO stream object
300 Args : An Entrez query string or a Bio::DB::Query::GenBank object.
301 It is suggested that you create a Bio::DB::Query::GenBank object and get
302 the entry count before you fetch a potentially large stream.
306 sub get_Stream_by_query
{
307 my ($self, $query) = @_;
308 unless (ref $query && $query->can('query')) {
309 $query = Bio
::DB
::Query
::GenBank
->new($query);
311 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
314 =head2 postprocess_data
316 Title : postprocess_data
317 Usage : $self->postprocess_data ( 'type' => 'string',
318 'location' => \$datastr );
319 Function: Process downloaded data before loading into a Bio::SeqIO. This
320 works for Genbank and Genpept, other classes should override
321 it with their own method.
323 Args : hash with two keys:
325 'type' can be 'string' or 'file'
326 'location' either file location or string reference containing data
330 sub postprocess_data
{
331 # retain this in case postprocessing is needed at a future date
335 =head2 request_format
337 Title : request_format
338 Usage : my ($req_format, $ioformat) = $self->request_format;
339 $self->request_format("genbank");
340 $self->request_format("fasta");
341 Function: Get/Set sequence format retrieval. The get-form will normally not
342 be used outside of this and derived modules.
343 Returns : Array of two strings, the first representing the format for
344 retrieval, and the second specifying the corresponding SeqIO format.
345 Args : $format = sequence format
350 my ( $self, $value ) = @_;
351 if ( defined $value ) {
353 if ( defined $FORMATMAP{$value} ) {
354 $self->{'_format'} = [ $value, $FORMATMAP{$value} ];
357 # Try to fall back to a default. Alternatively, we could throw
359 $self->{'_format'} = [ $value, $value ];
362 return @
{ $self->{'_format'} };
366 =head2 redirect_refseq
368 Title : redirect_refseq
369 Usage : $db->redirect_refseq(1)
370 Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
371 Returns : Boolean value
372 Args : Boolean value (optional)
373 Throws : 'unparseable output exception'
374 Note : This replaces 'no_redirect' as a more straightforward flag to
375 redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
376 instead of retrieving the NCBI records
380 sub redirect_refseq
{
382 return $self->{'_redirect_refseq'} = shift if @_;
383 return $self->{'_redirect_refseq'};
389 Usage : $db->complexity(3)
390 Function: get/set complexity value
391 Returns : value from 0-4 indicating level of complexity
392 Args : value from 0-4 (optional); if unset server assumes 1
393 Throws : if arg is not an integer or falls outside of noted range above
394 Note : From efetch docs, the complexity regulates the display:
396 0 - get the whole blob
397 1 - get the bioseq for gi of interest (default in Entrez)
398 2 - get the minimal bioseq-set containing the gi of interest
399 3 - get the minimal nuc-prot containing the gi of interest
400 4 - get the minimal pub-set containing the gi of interest
405 my ( $self, $comp ) = @_;
406 if ( defined $comp ) {
407 $self->throw("Complexity value must be integer between 0 and 4")
408 if $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
409 $self->{'_complexity'} = $comp;
411 return $self->{'_complexity'};
417 Usage : $db->strand(1)
418 Function: get/set strand value
419 Returns : strand value if set
420 Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
421 Throws : if arg is not an integer or is not 1 or 2
422 Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus, 0 = not relevant.
423 We should probably add in some functionality to convert over in the future.
428 my ($self, $str) = @_;
430 $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set") if
431 $str !~ /^\d+$/ || $str < 1 || $str > 2;
432 $self->{'_strand'} = $str;
434 return $self->{'_strand'};
440 Usage : $db->seq_start(123)
441 Function: get/set sequence start location
442 Returns : sequence start value if set
443 Args : integer; if unset server assumes 1
444 Throws : if arg is not an integer
449 my ($self, $start) = @_;
451 $self->throw("seq_start() must be integer value if set") if
453 $self->{'_seq_start'} = $start;
455 return $self->{'_seq_start'};
461 Usage : $db->seq_stop(456)
462 Function: get/set sequence stop (end) location
463 Returns : sequence stop (end) value if set
464 Args : integer; if unset server assumes 1
465 Throws : if arg is not an integer
470 my ($self, $stop) = @_;
472 $self->throw("seq_stop() must be integer if set") if
474 $self->{'_seq_stop'} = $stop;
476 return $self->{'_seq_stop'};
479 =head2 Bio::DB::WebDBSeqI methods
481 Overriding WebDBSeqI method to help newbies to retrieve sequences
483 =head2 get_Stream_by_acc
485 Title : get_Stream_by_acc
486 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
487 Function: gets a series of Seq objects by accession numbers
488 Returns : a Bio::SeqIO stream object
489 Args : $ref : a reference to an array of accession numbers for
490 the desired sequence entries
491 Note : For GenBank, this just calls the same code for get_Stream_by_id()
495 sub get_Stream_by_acc
{
496 my ( $self, $ids ) = @_;
497 my $newdb = $self->_check_id($ids);
498 if ( defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq') ) {
499 return $newdb->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
502 return $self->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
511 Returns : a Bio::DB::RefSeq reference or throws
512 Args : $id(s), $string
517 my ( $self, $ids ) = @_;
519 # NT contigs can not be retrieved
520 $self->throw("NT_ contigs are whole chromosome files which are not part of regular"
521 . "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
524 # Asking for a RefSeq from EMBL/GenBank
525 if ( $self->redirect_refseq ) {
526 if ( $ids =~ /N._/ ) {
528 "[$ids] is not a normal sequence database but a RefSeq entry."
529 . " Redirecting the request.\n" )
530 if $self->verbose >= 0;
531 return Bio
::DB
::RefSeq
->new();
540 Usage : $secs = $self->delay_policy
541 Function: NCBI requests a delay of 3 seconds between requests. This method
542 implements that policy.
543 Returns : number of seconds to delay
550 return $REQUEST_DELAY;
556 Usage : ($cookie,$querynum) = $db->cookie
557 Function: return the NCBI query cookie, this information is used by
558 Bio::DB::GenBank in conjunction with efetch, ripped from
559 Bio::DB::Query::GenBank
560 Returns : list of (cookie,querynum)
568 $self->{'_cookie'} = shift;
569 $self->{'_querynum'} = shift;
572 return @
{$self}{qw(_cookie _querynum)};
576 =head2 _parse_response
578 Title : _parse_response
579 Usage : $db->_parse_response($content)
580 Function: parse out response for cookie, this is a trimmed-down version
581 of _parse_response from Bio::DB::Query::GenBank
584 Throws : 'unparseable output exception'
588 sub _parse_response
{
591 if ( my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s ) {
592 $self->warn("Warning(s) from GenBank: $warning\n");
594 if ( my ($error) = $content =~ /<OutputMessage>([^<]+)/ ) {
595 $self->throw("Error from Genbank: $error");
597 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
598 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
599 $self->cookie( uri_unescape
($cookie), $querykey );
605 Usage : $db->no_redirect($content)
606 Function: DEPRECATED - Used to indicate that Bio::DB::GenBank instance retrieves
607 possible RefSeqs from EBI instead; default behavior is now to
608 retrieve directly from NCBI
611 Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
617 "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n".
618 "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n".
619 "redirect_refseq flag to 1");