MUSIGHBA1 no longer works as a primary ID, LOCUS apparently is not indexed; fixing
[bioperl-live.git] / Bio / DB / NCBIHelper.pm
blobe9f9d30086811dba1e528687fc1a8bcc1839cdce
2 # BioPerl module for Bio::DB::NCBIHelper
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # Interfaces with new WebDBSeqI interface
16 =head1 NAME
18 Bio::DB::NCBIHelper - A collection of routines useful for queries to
19 NCBI databases.
21 =head1 SYNOPSIS
23 # Do not use this module directly.
25 # get a Bio::DB::NCBIHelper object somehow
26 my $seqio = $db->get_Stream_by_acc(['J00522']);
27 foreach my $seq ( $seqio->next_seq ) {
28 # process seq
31 =head1 DESCRIPTION
33 Provides a single place to setup some common methods for querying NCBI
34 web databases. This module just centralizes the methods for
35 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
36 common HTML stripping done in L<postprocess_data>().
38 The base NCBI query URL used is:
39 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
41 =head1 FEEDBACK
43 =head2 Mailing Lists
45 User feedback is an integral part of the
46 evolution of this and other Bioperl modules. Send
47 your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation
49 is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to
68 help us keep track the bugs and their resolution.
69 Bug reports can be submitted via the web.
71 https://redmine.open-bio.org/projects/bioperl/
73 =head1 AUTHOR - Jason Stajich
75 Email jason@bioperl.org
77 =head1 APPENDIX
79 The rest of the documentation details each of the
80 object methods. Internal methods are usually
81 preceded with a _
83 =cut
85 # Let the code begin...
87 package Bio::DB::NCBIHelper;
88 use strict;
89 use vars qw($HOSTBASE %CGILOCATION %FORMATMAP $DEFAULTFORMAT $MAX_ENTRIES);
91 use Bio::DB::Query::GenBank;
92 use HTTP::Request::Common;
93 use URI;
94 use Bio::Root::IO;
95 use Bio::DB::RefSeq;
96 use URI::Escape qw(uri_unescape);
98 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
100 BEGIN {
101 $MAX_ENTRIES = 19000;
102 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
103 %CGILOCATION = (
104 'batch' => ['post' => '/entrez/eutils/epost.fcgi'],
105 'query' => ['get' => '/entrez/eutils/efetch.fcgi'],
106 'single' => ['get' => '/entrez/eutils/efetch.fcgi'],
107 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'],
108 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'],
109 'webenv' => ['get' => '/entrez/eutils/efetch.fcgi']
112 %FORMATMAP = ( 'gb' => 'genbank',
113 'gp' => 'genbank',
114 'fasta' => 'fasta',
115 'asn.1' => 'entrezgene',
116 'gbwithparts' => 'genbank',
118 $DEFAULTFORMAT = 'gb';
121 # the new way to make modules a little more lightweight
123 sub new {
124 my ($class, @args ) = @_;
125 my $self = $class->SUPER::new(@args);
126 my ($seq_start,$seq_stop,$no_redirect, $redirect, $complexity,$strand) =
127 $self->_rearrange([qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND)],
128 @args);
129 $seq_start && $self->seq_start($seq_start);
130 $seq_stop && $self->seq_stop($seq_stop);
131 $no_redirect && $self->no_redirect($no_redirect);
132 $redirect && $self->redirect_refseq($redirect);
133 $strand && $self->strand($strand);
134 # adjust statement to accept zero value
135 defined $complexity && ($complexity >=0 && $complexity <=4)
136 && $self->complexity($complexity);
137 return $self;
141 =head2 get_params
143 Title : get_params
144 Usage : my %params = $self->get_params($mode)
145 Function: Returns key,value pairs to be passed to NCBI database
146 for either 'batch' or 'single' sequence retrieval method
147 Returns : a key,value pair hash
148 Args : 'single' or 'batch' mode for retrieval
150 =cut
152 sub get_params {
153 my ($self, $mode) = @_;
154 $self->throw("subclass did not implement get_params");
157 =head2 default_format
159 Title : default_format
160 Usage : my $format = $self->default_format
161 Function: Returns default sequence format for this module
162 Returns : string
163 Args : none
165 =cut
167 sub default_format {
168 return $DEFAULTFORMAT;
171 =head2 get_request
173 Title : get_request
174 Usage : my $url = $self->get_request
175 Function: HTTP::Request
176 Returns :
177 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
179 =cut
181 sub get_request {
182 my ($self, @qualifiers) = @_;
183 my ($mode, $uids, $format, $query, $seq_start, $seq_stop, $strand, $complexity) =
184 $self->_rearrange([qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
185 @qualifiers);
186 $mode = lc $mode;
187 ($format) = $self->request_format() unless ( defined $format);
188 if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
189 my %params = $self->get_params($mode);
190 if( ! %params ) {
191 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
193 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
194 unless( $mode eq 'webenv' || defined $uids || defined $query) {
195 $self->throw("Must specify a query or list of uids to fetch");
197 if ($query && $query->can('cookie')) {
198 @params{'WebEnv','query_key'} = $query->cookie;
199 $params{'db'} = $query->db;
201 elsif ($query) {
202 $params{'id'} = join ',',$query->ids;
204 # for batch retrieval, non-query style
205 elsif ($mode eq 'webenv' && $self->can('cookie')) {
206 @params{'WebEnv','query_key'} = $self->cookie;
208 elsif ($uids) {
209 if( ref($uids) =~ /array/i ) {
210 $uids = join(",", @$uids);
212 $params{'id'} = $uids;
214 $seq_start && ($params{'seq_start'} = $seq_start);
215 $seq_stop && ($params{'seq_stop'} = $seq_stop);
216 $strand && ($params{'strand'} = $strand);
217 if (defined $complexity && ($seq_start || $seq_stop || $strand)) {
218 $self->warn("Complexity set to $complexity; seq_start and seq_stop may not work!")
219 if ($complexity != 1 && ($seq_start || $seq_stop));
220 $self->warn("Complexity set to 0; expect strange results with strand set to 2")
221 if ($complexity == 0 && $strand == 2 && $format eq 'fasta');
223 defined $complexity && ($params{'complexity'} = $complexity);
224 $params{'rettype'} = $format unless $mode eq 'batch';
225 # for now, 'post' is batch retrieval
226 if ($CGILOCATION{$mode}[0] eq 'post') {
227 my $response = $self->ua->request(POST $url,[%params]);
228 $response->proxy_authorization_basic($self->authentication)
229 if ( $self->authentication);
230 $self->_parse_response($response->content);
231 my ($cookie, $querykey) = $self->cookie;
232 my %qualifiers = ('-mode' => 'webenv',
233 '-seq_start' => $seq_start,
234 '-seq_stop' => $seq_stop,
235 '-strand' => $strand,
236 '-complexity' => $complexity,
237 '-format' => $format);
238 return $self->get_request(%qualifiers);
239 } else {
240 $url->query_form(%params);
241 return GET $url;
245 =head2 get_Stream_by_batch
247 Title : get_Stream_by_batch
248 Usage : $seq = $db->get_Stream_by_batch($ref);
249 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
250 at a time. For large numbers of sequences, this is far superior
251 than get_Stream_by_[id/acc]().
252 Example :
253 Returns : a Bio::SeqIO stream object
254 Args : $ref : either an array reference, a filename, or a filehandle
255 from which to get the list of unique ids/accession numbers.
257 NOTE: deprecated API. Use get_Stream_by_id() instead.
259 =cut
261 *get_Stream_by_batch = sub {
262 my $self = shift;
263 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
264 $self->get_Stream_by_id(@_)
267 =head2 get_Stream_by_query
269 Title : get_Stream_by_query
270 Usage : $seq = $db->get_Stream_by_query($query);
271 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
272 at a time. For large numbers of sequences, this is far superior
273 than get_Stream_by_[id/acc]().
274 Example :
275 Returns : a Bio::SeqIO stream object
276 Args : $query : An Entrez query string or a
277 Bio::DB::Query::GenBank object. It is suggested that you
278 create a Bio::DB::Query::GenBank object and get the entry
279 count before you fetch a potentially large stream.
281 =cut
283 sub get_Stream_by_query {
284 my ($self, $query) = @_;
285 unless (ref $query && $query->can('query')) {
286 $query = Bio::DB::Query::GenBank->new($query);
288 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
291 =head2 postprocess_data
293 Title : postprocess_data
294 Usage : $self->postprocess_data ( 'type' => 'string',
295 'location' => \$datastr);
296 Function: process downloaded data before loading into a Bio::SeqIO
297 Returns : void
298 Args : hash with two keys - 'type' can be 'string' or 'file'
299 - 'location' either file location or string
300 reference containing data
302 =cut
304 # the default method, works for genbank/genpept, other classes should
305 # override it with their own method.
307 sub postprocess_data {
308 # retain this in case postprocessing is needed at a future date
312 =head2 request_format
314 Title : request_format
315 Usage : my ($req_format, $ioformat) = $self->request_format;
316 $self->request_format("genbank");
317 $self->request_format("fasta");
318 Function: Get/Set sequence format retrieval. The get-form will normally not
319 be used outside of this and derived modules.
320 Returns : Array of two strings, the first representing the format for
321 retrieval, and the second specifying the corresponding SeqIO format.
322 Args : $format = sequence format
324 =cut
326 sub request_format {
327 my ($self, $value) = @_;
328 if( defined $value ) {
329 $value = lc $value;
330 if( defined $FORMATMAP{$value} ) {
331 $self->{'_format'} = [ $value, $FORMATMAP{$value}];
332 } else {
333 # Try to fall back to a default. Alternatively, we could throw
334 # an exception
335 $self->{'_format'} = [ $value, $value ];
338 return @{$self->{'_format'}};
341 =head2 redirect_refseq
343 Title : redirect_refseq
344 Usage : $db->redirect_refseq(1)
345 Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
346 Returns : Boolean value
347 Args : Boolean value (optional)
348 Throws : 'unparseable output exception'
349 Note : This replaces 'no_redirect' as a more straightforward flag to
350 redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
351 instead of retrievign the NCBI records
353 =cut
355 sub redirect_refseq {
356 my $self = shift;
357 return $self->{'_redirect_refseq'} = shift if @_;
358 return $self->{'_redirect_refseq'};
361 =head2 complexity
363 Title : complexity
364 Usage : $db->complexity(3)
365 Function: get/set complexity value
366 Returns : value from 0-4 indicating level of complexity
367 Args : value from 0-4 (optional); if unset server assumes 1
368 Throws : if arg is not an integer or falls outside of noted range above
369 Note : From efetch docs:
371 Complexity regulates the display:
373 * 0 - get the whole blob
374 * 1 - get the bioseq for gi of interest (default in Entrez)
375 * 2 - get the minimal bioseq-set containing the gi of interest
376 * 3 - get the minimal nuc-prot containing the gi of interest
377 * 4 - get the minimal pub-set containing the gi of interest
379 =cut
381 sub complexity {
382 my ($self, $comp) = @_;
383 if (defined $comp) {
384 $self->throw("Complexity value must be integer between 0 and 4") if
385 $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
386 $self->{'_complexity'} = $comp;
388 return $self->{'_complexity'};
391 =head2 strand
393 Title : strand
394 Usage : $db->strand(1)
395 Function: get/set strand value
396 Returns : strand value if set
397 Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
398 Throws : if arg is not an integer or is not 1 or 2
399 Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant.
400 We should probably add in some functionality to convert over in the future.
402 =cut
404 sub strand {
405 my ($self, $str) = @_;
406 if ($str) {
407 $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set") if
408 $str !~ /^\d+$/ || $str < 1 || $str > 2;
409 $self->{'_strand'} = $str;
411 return $self->{'_strand'};
414 =head2 seq_start
416 Title : seq_start
417 Usage : $db->seq_start(123)
418 Function: get/set sequence start location
419 Returns : sequence start value if set
420 Args : integer; if unset server assumes 1
421 Throws : if arg is not an integer
423 =cut
425 sub seq_start {
426 my ($self, $start) = @_;
427 if ($start) {
428 $self->throw("seq_start() must be integer value if set") if
429 $start !~ /^\d+$/;
430 $self->{'_seq_start'} = $start;
432 return $self->{'_seq_start'};
435 =head2 seq_stop
437 Title : seq_stop
438 Usage : $db->seq_stop(456)
439 Function: get/set sequence stop (end) location
440 Returns : sequence stop (end) value if set
441 Args : integer; if unset server assumes 1
442 Throws : if arg is not an integer
444 =cut
446 sub seq_stop {
447 my ($self, $stop) = @_;
448 if ($stop) {
449 $self->throw("seq_stop() must be integer if set") if
450 $stop !~ /^\d+$/;
451 $self->{'_seq_stop'} = $stop;
453 return $self->{'_seq_stop'};
456 =head2 Bio::DB::WebDBSeqI methods
458 Overriding WebDBSeqI method to help newbies to retrieve sequences
460 =head2 get_Stream_by_acc
462 Title : get_Stream_by_acc
463 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
464 Function: Gets a series of Seq objects by accession numbers
465 Returns : a Bio::SeqIO stream object
466 Args : $ref : a reference to an array of accession numbers for
467 the desired sequence entries
468 Note : For GenBank, this just calls the same code for get_Stream_by_id()
470 =cut
472 sub get_Stream_by_acc {
473 my ($self, $ids ) = @_;
474 my $newdb = $self->_check_id($ids);
475 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
476 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
477 } else {
478 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
483 =head2 _check_id
485 Title : _check_id
486 Usage :
487 Function:
488 Returns : A Bio::DB::RefSeq reference or throws
489 Args : $id(s), $string
491 =cut
493 sub _check_id {
494 my ($self, $ids) = @_;
496 # NT contigs can not be retrieved
497 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
498 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
499 if $ids =~ /NT_/;
501 # Asking for a RefSeq from EMBL/GenBank
503 if ($self->redirect_refseq) {
504 if ($ids =~ /N._/) {
505 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
506 " Redirecting the request.\n")
507 if $self->verbose >= 0;
508 return Bio::DB::RefSeq->new();
513 =head2 delay_policy
515 Title : delay_policy
516 Usage : $secs = $self->delay_policy
517 Function: return number of seconds to delay between calls to remote db
518 Returns : number of seconds to delay
519 Args : none
521 NOTE: NCBI requests a delay of 3 seconds between requests. This method
522 implements that policy.
524 =cut
526 sub delay_policy {
527 my $self = shift;
528 return 3;
531 =head2 cookie
533 Title : cookie
534 Usage : ($cookie,$querynum) = $db->cookie
535 Function: return the NCBI query cookie
536 Returns : list of (cookie,querynum)
537 Args : none
539 NOTE: this information is used by Bio::DB::GenBank in
540 conjunction with efetch.
542 =cut
544 # ripped from Bio::DB::Query::GenBank
545 sub cookie {
546 my $self = shift;
547 if (@_) {
548 $self->{'_cookie'} = shift;
549 $self->{'_querynum'} = shift;
551 else {
552 return @{$self}{qw(_cookie _querynum)};
556 =head2 _parse_response
558 Title : _parse_response
559 Usage : $db->_parse_response($content)
560 Function: parse out response for cookie
561 Returns : empty
562 Args : none
563 Throws : 'unparseable output exception'
565 =cut
567 # trimmed-down version of _parse_response from Bio::DB::Query::GenBank
568 sub _parse_response {
569 my $self = shift;
570 my $content = shift;
571 if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
572 $self->warn("Warning(s) from GenBank: $warning\n");
574 if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
575 $self->throw("Error from Genbank: $error");
577 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
578 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
579 $self->cookie(uri_unescape($cookie),$querykey);
582 ########### DEPRECATED!!!! ###########
584 =head2 no_redirect
586 Title : no_redirect
587 Usage : $db->no_redirect($content)
588 Function: Used to indicate that Bio::DB::GenBank instance retrieves
589 possible RefSeqs from EBI instead; default behavior is now to
590 retrieve directly from NCBI
591 Returns : None
592 Args : None
593 Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
595 =cut
597 sub no_redirect {
598 shift->throw(
599 "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n".
600 "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n".
601 "redirect_refseq flag to 1");
606 __END__