Sync with main trunk
[bioperl-live.git] / Bio / DB / NCBIHelper.pm
blob7d8c6f6943bc6a75ad4d57534410d162a6aae643
1 # $Id$
3 # BioPerl module for Bio::DB::NCBIHelper
5 # Cared for by Jason Stajich
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 # Interfaces with new WebDBSeqI interface
15 =head1 NAME
17 Bio::DB::NCBIHelper - A collection of routines useful for queries to
18 NCBI databases.
20 =head1 SYNOPSIS
22 # Do not use this module directly.
24 # get a Bio::DB::NCBIHelper object somehow
25 my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']);
26 while ( my $seq = $seqio->next_seq ) {
27 # process seq
30 =head1 DESCRIPTION
32 Provides a single place to setup some common methods for querying NCBI
33 web databases. This module just centralizes the methods for
34 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
35 common HTML stripping done in L<postprocess_data>().
37 The base NCBI query URL used is:
38 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the
45 evolution of this and other Bioperl modules. Send
46 your comments and suggestions preferably to one
47 of the Bioperl mailing lists. Your participation
48 is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 =head2 Reporting Bugs
55 Report bugs to the Bioperl bug tracking system to
56 help us keep track of the bugs and their resolution.
57 Bug reports can be submitted via the web.
59 http://bugzilla.open-bio.org/
61 =head1 AUTHOR - Jason Stajich
63 Email jason@bioperl.org
65 =head1 APPENDIX
67 The rest of the documentation details each of the
68 object methods. Internal methods are usually
69 preceded with a _
71 =cut
73 # Let the code begin...
75 package Bio::DB::NCBIHelper;
76 use strict;
77 use vars qw($HOSTBASE %CGILOCATION %FORMATMAP $DEFAULTFORMAT $MAX_ENTRIES);
79 use Bio::DB::Query::GenBank;
80 use HTTP::Request::Common;
81 use URI;
82 use Bio::Root::IO;
83 use Bio::DB::RefSeq;
84 use URI::Escape qw(uri_unescape);
86 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
BEGIN {
    # Package-wide defaults shared by the methods of this module.
    $MAX_ENTRIES   = 19000;
    $DEFAULTFORMAT = 'gb';
    $HOSTBASE      = 'http://eutils.ncbi.nlm.nih.gov';

    # Each retrieval mode maps to [ HTTP method => CGI path ]. Only 'batch'
    # is POSTed (to epost); every other mode is a GET against efetch.
    my $efetch_path = '/entrez/eutils/efetch.fcgi';
    %CGILOCATION = (
        'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
        ( map { ( $_ => [ 'get' => $efetch_path ] ) }
              qw(query single version gi webenv) ),
    );

    # Retrieval format name => format name stored alongside it by
    # request_format().
    %FORMATMAP = (
        'gb'          => 'genbank',
        'gp'          => 'genbank',
        'fasta'       => 'fasta',
        'asn.1'       => 'entrezgene',
        'gbwithparts' => 'genbank',
    );
}
109 # the new way to make modules a little more lightweight
# Constructor: builds the object through the parent class, then applies the
# optional -seq_start, -seq_stop, -no_redirect, -redirect_refseq, -strand and
# -complexity arguments through their accessor methods.
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    my ($seq_start, $seq_stop, $no_redirect, $redirect, $complexity, $strand) =
        $self->_rearrange(
            [qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND)],
            @args
        );

    # These options are only applied when they carry a true value.
    $self->seq_start($seq_start)      if $seq_start;
    $self->seq_stop($seq_stop)        if $seq_stop;
    $self->no_redirect($no_redirect)  if $no_redirect;
    $self->redirect_refseq($redirect) if $redirect;
    $self->strand($strand)            if $strand;

    # Complexity may legitimately be 0, so test definedness instead of truth;
    # values outside 0..4 are ignored here.
    if ( defined $complexity && $complexity >= 0 && $complexity <= 4 ) {
        $self->complexity($complexity);
    }

    return $self;
}
129 =head2 get_params
131 Title : get_params
132 Usage : my %params = $self->get_params($mode)
133 Function: Returns key,value pairs to be passed to NCBI database
134 for either 'batch' or 'single' sequence retrieval method
135 Returns : a key,value pair hash
136 Args : 'single' or 'batch' mode for retrieval
138 =cut
sub get_params {
    my $self = shift;

    # This base implementation always throws; the message indicates that a
    # subclass is expected to supply its own get_params.
    $self->throw("subclass did not implement get_params");
}
145 =head2 default_format
147 Title : default_format
148 Usage : my $format = $self->default_format
149 Function: Returns default sequence format for this module
150 Returns : string
151 Args : none
153 =cut
sub default_format {
    # Hands back the package-level default format name.
    my $format = $DEFAULTFORMAT;
    return $format;
}
159 =head2 get_request
161 Title : get_request
162 Usage : my $url = $self->get_request
163 Function: Builds the HTTP request for the given retrieval qualifiers
164 Returns : an HTTP::Request object
165 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
167 =cut
# Build the HTTP::Request used to fetch records.
#
# Args    : named qualifiers -mode, -uids, -format, -query, -seq_start,
#           -seq_stop, -strand, -complexity
# Returns : an HTTP::Request (GET) object. For modes configured as 'post'
#           the ids are first POSTed, the reply is handed to
#           _parse_response(), and the request for the follow-up 'webenv'
#           retrieval is returned.
# Throws  : if get_params() yields nothing for the mode, if neither a query
#           nor uids are given (except in 'webenv' mode), or if the POST
#           does not succeed.
sub get_request {
    my ($self, @qualifiers) = @_;
    my ($mode, $uids, $format, $query, $seq_start, $seq_stop, $strand, $complexity) =
        $self->_rearrange(
            [qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
            @qualifiers
        );

    # Apply the default before lower-casing so that an omitted mode does not
    # raise an "uninitialized value" warning from lc().
    $mode = ( defined $mode && $mode ne '' ) ? lc $mode : 'single';
    ($format) = $self->request_format() unless defined $format;

    my %params = $self->get_params($mode);
    if ( !%params ) {
        $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'");
    }

    my $url = URI->new( $HOSTBASE . $CGILOCATION{$mode}[1] );

    unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
        $self->throw("Must specify a query or list of uids to fetch");
    }

    if ( $query && $query->can('cookie') ) {
        @params{'WebEnv','query_key'} = $query->cookie;
        $params{'db'} = $query->db;
    }
    elsif ($query) {
        $params{'id'} = join ',', $query->ids;
    }
    # for batch retrieval, non-query style
    elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
        @params{'WebEnv','query_key'} = $self->cookie;
    }
    elsif ($uids) {
        if ( ref($uids) =~ /array/i ) {
            $uids = join( ",", @$uids );
        }
        $params{'id'} = $uids;
    }

    $params{'seq_start'} = $seq_start if $seq_start;
    $params{'seq_stop'}  = $seq_stop  if $seq_stop;
    $params{'strand'}    = $strand    if $strand;

    if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
        $self->warn("Complexity set to $complexity; seq_start and seq_stop may not work!")
            if ( $complexity != 1 && ( $seq_start || $seq_stop ) );

        # Guard $strand and $format: either may be unset at this point, and
        # comparing undef would only produce warnings.
        $self->warn("Complexity set to 0; expect strange results with strand set to 2")
            if ( $complexity == 0
                && $strand && $strand == 2
                && defined $format && $format eq 'fasta' );
    }

    $params{'complexity'} = $complexity if defined $complexity;
    $params{'rettype'}    = $format unless $mode eq 'batch';

    # for now, 'post' is batch retrieval
    if ( $CGILOCATION{$mode}[0] eq 'post' ) {
        # Credentials must be attached to the request before it is sent;
        # setting them on the response has no effect on the exchange.
        my $request = POST( $url, [%params] );
        $request->proxy_authorization_basic( $self->authentication )
            if ( $self->authentication );

        my $response = $self->ua->request($request);
        $self->throw( "Batch POST to $url failed: " . $response->status_line )
            unless $response->is_success;

        # Stores the WebEnv/query key pair for the follow-up request.
        $self->_parse_response( $response->content );

        my %qualifiers = (
            '-mode'       => 'webenv',
            '-seq_start'  => $seq_start,
            '-seq_stop'   => $seq_stop,
            '-strand'     => $strand,
            '-complexity' => $complexity,
            '-format'     => $format,
        );
        return $self->get_request(%qualifiers);
    }

    $url->query_form(%params);
    return GET($url);
}
233 =head2 get_Stream_by_batch
235 Title : get_Stream_by_batch
236 Usage : $seq = $db->get_Stream_by_batch($ref);
237 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
238 at a time. For large numbers of sequences, this is far superior
239 than get_Stream_by_[id/acc]().
240 Example :
241 Returns : a Bio::SeqIO stream object
242 Args : $ref : either an array reference, a filename, or a filehandle
243 from which to get the list of unique ids/accession numbers.
245 NOTE: deprecated API. Use get_Stream_by_id() instead.
247 =cut
# Deprecated alias: emits a deprecation notice, then forwards every argument
# unchanged to get_Stream_by_id().
*get_Stream_by_batch = sub {
    my $db = shift;
    $db->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
    return $db->get_Stream_by_id(@_);
};
255 =head2 get_Stream_by_query
257 Title : get_Stream_by_query
258 Usage : $seq = $db->get_Stream_by_query($query);
259 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
260 at a time. For large numbers of sequences, this is far superior
261 than get_Stream_by_[id/acc]().
262 Example :
263 Returns : a Bio::SeqIO stream object
264 Args : $query : An Entrez query string or a
265 Bio::DB::Query::GenBank object. It is suggested that you
266 create a Bio::DB::Query::GenBank object and get the entry
267 count before you fetch a potentially large stream.
269 =cut
sub get_Stream_by_query {
    my ($self, $query) = @_;

    # Anything that is not already an object offering a query() method is
    # wrapped in a Bio::DB::Query::GenBank object first.
    my $is_query_object = ref $query && $query->can('query');
    if ( !$is_query_object ) {
        $query = Bio::DB::Query::GenBank->new($query);
    }

    return $self->get_seq_stream( '-query' => $query, '-mode' => 'query' );
}
279 =head2 postprocess_data
281 Title : postprocess_data
282 Usage : $self->postprocess_data ( 'type' => 'string',
283 'location' => \$datastr);
284 Function: process downloaded data before loading into a Bio::SeqIO
285 Returns : void
286 Args : hash with two keys - 'type' can be 'string' or 'file'
287 - 'location' either file location or string
288 reference containing data
290 =cut
292 # the default method, works for genbank/genpept, other classes should
293 # override it with their own method.
sub postprocess_data {
    # Deliberately a no-op; kept as a hook in case postprocessing is needed
    # at a future date.
    return;
}
300 =head2 request_format
302 Title : request_format
303 Usage : my ($req_format, $ioformat) = $self->request_format;
304 $self->request_format("genbank");
305 $self->request_format("fasta");
306 Function: Get/Set sequence format retrieval. The get-form will normally not
307 be used outside of this and derived modules.
308 Returns : Array of two strings, the first representing the format for
309 retrieval, and the second specifying the corresponding SeqIO format.
310 Args : $format = sequence format
312 =cut
# Get/set the retrieval format.
#
# Args    : $value - optional format name; it is lower-cased before use
# Returns : list of two strings: the retrieval format and the format mapped
#           to it in %FORMATMAP (or the retrieval format itself when there
#           is no mapping). Returns an empty list if no format could be
#           determined.
sub request_format {
    my ($self, $value) = @_;

    if ( defined $value ) {
        $value = lc $value;

        # Unknown formats fall back to using the same name for both slots.
        # Alternatively, we could throw an exception.
        my $mapped = defined $FORMATMAP{$value} ? $FORMATMAP{$value} : $value;
        $self->{'_format'} = [ $value, $mapped ];
    }
    elsif ( !defined $self->{'_format'} ) {
        # Nothing has been set yet: fall back to the module's default format
        # instead of dereferencing an undefined value, which is fatal under
        # 'use strict'.
        my $default = $self->default_format;
        $self->request_format($default) if defined $default;
    }

    return @{ $self->{'_format'} || [] };
}
329 =head2 redirect_refseq
331 Title : redirect_refseq
332 Usage : $db->redirect_refseq(1)
333 Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
334 Returns : Boolean value
335 Args : Boolean value (optional)
336 Throws : 'unparseable output exception'
337 Note : This replaces 'no_redirect' as a more straightforward flag to
338 redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
339 instead of retrieving the NCBI records
341 =cut
sub redirect_refseq {
    my ($self, @args) = @_;

    # Any supplied argument (even a false or undefined one) overwrites the
    # stored flag.
    if (@args) {
        $self->{'_redirect_refseq'} = $args[0];
    }

    return $self->{'_redirect_refseq'};
}
349 =head2 complexity
351 Title : complexity
352 Usage : $db->complexity(3)
353 Function: get/set complexity value
354 Returns : value from 0-4 indicating level of complexity
355 Args : value from 0-4 (optional); if unset server assumes 1
356 Throws : if arg is not an integer or falls outside of noted range above
357 Note : From efetch docs:
359 Complexity regulates the display:
361 * 0 - get the whole blob
362 * 1 - get the bioseq for gi of interest (default in Entrez)
363 * 2 - get the minimal bioseq-set containing the gi of interest
364 * 3 - get the minimal nuc-prot containing the gi of interest
365 * 4 - get the minimal pub-set containing the gi of interest
367 =cut
sub complexity {
    my ($self, $comp) = @_;

    # Plain getter when no (defined) value is supplied.
    return $self->{'_complexity'} unless defined $comp;

    # Only the integers 0 through 4 are accepted.
    my $in_range = $comp =~ /^\d+$/ && $comp >= 0 && $comp <= 4;
    $self->throw("Complexity value must be integer between 0 and 4")
        unless $in_range;

    return $self->{'_complexity'} = $comp;
}
379 =head2 strand
381 Title : strand
382 Usage : $db->strand(1)
383 Function: get/set strand value
384 Returns : strand value if set
385 Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
386 Throws : if arg is not an integer or is not 1 or 2
387 Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant.
388 We should probably add in some functionality to convert over in the future.
390 =cut
sub strand {
    my ($self, $str) = @_;

    # A false or missing argument means "just read the current value".
    return $self->{'_strand'} unless $str;

    # Accept only 1 (plus strand) or 2 (minus strand).
    my $is_valid = $str =~ /^\d+$/ && $str >= 1 && $str <= 2;
    $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set")
        unless $is_valid;

    $self->{'_strand'} = $str;
    return $self->{'_strand'};
}
402 =head2 seq_start
404 Title : seq_start
405 Usage : $db->seq_start(123)
406 Function: get/set sequence start location
407 Returns : sequence start value if set
408 Args : integer; if unset server assumes 1
409 Throws : if arg is not an integer
411 =cut
sub seq_start {
    my ($self, $start) = @_;

    # Without a true argument this is a plain getter.
    return $self->{'_seq_start'} unless $start;

    # Only strings of digits are accepted as a start position.
    $self->throw("seq_start() must be integer value if set")
        unless $start =~ /^\d+$/;

    $self->{'_seq_start'} = $start;
    return $self->{'_seq_start'};
}
423 =head2 seq_stop
425 Title : seq_stop
426 Usage : $db->seq_stop(456)
427 Function: get/set sequence stop (end) location
428 Returns : sequence stop (end) value if set
429 Args : integer; if unset server assumes 1
430 Throws : if arg is not an integer
432 =cut
sub seq_stop {
    my ($self, $stop) = @_;

    # The stored value is only touched when a true argument is supplied.
    if ($stop) {
        unless ( $stop =~ /^\d+$/ ) {
            $self->throw("seq_stop() must be integer if set");
        }
        $self->{'_seq_stop'} = $stop;
    }

    return $self->{'_seq_stop'};
}
444 =head2 Bio::DB::WebDBSeqI methods
446 Overriding WebDBSeqI method to help newbies to retrieve sequences
448 =head2 get_Stream_by_acc
450 Title : get_Stream_by_acc
451 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
452 Function: Gets a series of Seq objects by accession numbers
453 Returns : a Bio::SeqIO stream object
454 Args : $ref : a reference to an array of accession numbers for
455 the desired sequence entries
456 Note : For GenBank, this just calls the same code for get_Stream_by_id()
458 =cut
sub get_Stream_by_acc {
    my ($self, $ids) = @_;

    # _check_id() may hand back a Bio::DB::RefSeq object; if it does, the
    # request is served by that object instead of by this one.
    my $newdb = $self->_check_id($ids);
    my $use_refseq =
        defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq');
    my $source = $use_refseq ? $newdb : $self;

    return $source->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
}
471 =head2 _check_id
473 Title : _check_id
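474 Usage : my $newdb = $self->_check_id($ids);
475 Function: Throws for NT_ contig ids; when redirect_refseq is set and the id looks like a RefSeq accession, warns and returns a Bio::DB::RefSeq object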
476 Returns : A Bio::DB::RefSeq reference or throws
477 Args : $id(s), $string
479 =cut
# Inspect the requested id(s) before retrieval.
#
# Args    : $ids - a single id string or a reference to an array of ids
# Returns : a new Bio::DB::RefSeq object when redirect_refseq is set and at
#           least one id matches the RefSeq pattern; otherwise nothing
#           (undef in scalar context)
# Throws  : if any id is an NT_ contig
sub _check_id {
    my ($self, $ids) = @_;

    # Callers pass either one id or an array reference; normalise to a list
    # so the patterns below are tested against the ids themselves rather
    # than against a stringified reference.
    my @id_list = ref($ids) =~ /array/i ? @$ids : ($ids);
    @id_list = grep { defined } @id_list;

    # NT contigs can not be retrieved
    $self->throw("NT_ contigs are whole chromosome files which are not part of regular ".
                 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
        if grep { /NT_/ } @id_list;

    # Asking for a RefSeq from EMBL/GenBank
    return unless $self->redirect_refseq;
    return unless grep { /N._/ } @id_list;

    my $shown_ids = join ',', @id_list;
    $self->warn("[$shown_ids] is not a normal sequence database but a RefSeq entry.".
                " Redirecting the request.\n")
        if $self->verbose >= 0;

    return Bio::DB::RefSeq->new();
}
501 =head2 delay_policy
503 Title : delay_policy
504 Usage : $secs = $self->delay_policy
505 Function: return number of seconds to delay between calls to remote db
506 Returns : number of seconds to delay
507 Args : none
509 NOTE: NCBI requests a delay of 3 seconds between requests. This method
510 implements that policy.
512 =cut
sub delay_policy {
    # Fixed delay in seconds; the invocant is not consulted.
    return 3;
}
519 =head2 cookie
521 Title : cookie
522 Usage : ($cookie,$querynum) = $db->cookie
523 Function: return the NCBI query cookie
524 Returns : list of (cookie,querynum)
525 Args : none
527 NOTE: this information is used by Bio::DB::GenBank in
528 conjunction with efetch.
530 =cut
532 # ripped from Bio::DB::Query::GenBank
sub cookie {
    my ($self, @pair) = @_;

    # Getter: hand back the stored (cookie, query number) pair.
    return @{$self}{qw(_cookie _querynum)} unless @pair;

    # Setter: the first argument is the cookie, the second the query number
    # (undef when only one argument was given).
    $self->{'_cookie'} = $pair[0];
    return $self->{'_querynum'} = $pair[1];
}
544 =head2 _parse_response
546 Title : _parse_response
547 Usage : $db->_parse_response($content)
548 Function: parse out response for cookie
549 Returns : empty
550 Args : $content - the response content (string) to parse
551 Throws : 'unparseable output exception'
553 =cut
555 # trimmed-down version of _parse_response from Bio::DB::Query::GenBank
# Extract the WebEnv cookie and query key from a response body and store
# them through cookie().
#
# Args   : $content - response body as a string
# Throws : when the response carries an <OutputMessage>, or when no
#          WebEnv/QueryKey pair can be found in it
sub _parse_response {
    my ($self, $content) = @_;
    $content = '' unless defined $content;

    # An <ErrorList> is reported as a warning only.
    if ( my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s ) {
        $self->warn("Warning(s) from GenBank: $warning\n");
    }

    # An <OutputMessage> is treated as fatal.
    if ( my ($error) = $content =~ /<OutputMessage>([^<]+)/ ) {
        $self->throw("Error from Genbank: $error");
    }

    my ($cookie)   = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
    my ($querykey) = $content =~ m!<QueryKey>(\d+)!;

    # Without both values the follow-up request cannot be built; fail here
    # rather than storing an undefined cookie.
    $self->throw("Unparseable output from GenBank: no WebEnv/QueryKey found in response")
        unless defined $cookie && defined $querykey;

    $self->cookie( uri_unescape($cookie), $querykey );
}
570 ########### DEPRECATED!!!! ###########
572 =head2 no_redirect
574 Title : no_redirect
575 Usage : $db->no_redirect($content)
576 Function: Used to indicate that Bio::DB::GenBank instance retrieves
577 possible RefSeqs from EBI instead; default behavior is now to
578 retrieve directly from NCBI
579 Returns : None
580 Args : None
581 Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
583 =cut
sub no_redirect {
    my $self = shift;

    # Deprecated: every call throws and points the caller at the
    # redirect_refseq flag.
    my $message = join '',
        "Use of no_redirect() is deprecated.  Bio::DB::GenBank default is to always\n",
        "retrieve from NCBI.  In order to redirect possible RefSeqs to EBI, set\n",
        "redirect_refseq flag to 1";

    $self->throw($message);
}
594 __END__