tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / NCBIHelper.pm
blob91e42faab12b0fb8e3890b635df64854021a3026
1 # $Id$
3 # BioPerl module for Bio::DB::NCBIHelper
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 # Interfaces with new WebDBSeqI interface
17 =head1 NAME
19 Bio::DB::NCBIHelper - A collection of routines useful for queries to
20 NCBI databases.
22 =head1 SYNOPSIS
24 # Do not use this module directly.
26 # get a Bio::DB::NCBIHelper object somehow
27 my $seqio = $db->get_Stream_by_acc(['MUSIGHBA1']);
28 foreach my $seq ( $seqio->next_seq ) {
29 # process seq
32 =head1 DESCRIPTION
34 Provides a single place to setup some common methods for querying NCBI
35 web databases. This module just centralizes the methods for
36 constructing a URL for querying NCBI GenBank and NCBI GenPept and the
37 common HTML stripping done in L<postprocess_data>().
39 The base NCBI query URL used is:
40 http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
42 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the
47 evolution of this and other Bioperl modules. Send
48 your comments and suggestions preferably to one
49 of the Bioperl mailing lists. Your participation
50 is much appreciated.
52 bioperl-l@bioperl.org - General discussion
53 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 =head2 Support
57 Please direct usage questions or support issues to the mailing list:
59 I<bioperl-l@bioperl.org>
61 rather than to the module maintainer directly. Many experienced and
62 reponsive experts will be able look at the problem and quickly
63 address it. Please include a thorough description of the problem
64 with code and data examples if at all possible.
66 =head2 Reporting Bugs
68 Report bugs to the Bioperl bug tracking system to
69 help us keep track the bugs and their resolution.
70 Bug reports can be submitted via the web.
72 http://bugzilla.open-bio.org/
74 =head1 AUTHOR - Jason Stajich
76 Email jason@bioperl.org
78 =head1 APPENDIX
80 The rest of the documentation details each of the
81 object methods. Internal methods are usually
82 preceded with a _
84 =cut
86 # Let the code begin...
88 package Bio::DB::NCBIHelper;
89 use strict;
90 use vars qw($HOSTBASE %CGILOCATION %FORMATMAP $DEFAULTFORMAT $MAX_ENTRIES);
92 use Bio::DB::Query::GenBank;
93 use HTTP::Request::Common;
94 use URI;
95 use Bio::Root::IO;
96 use Bio::DB::RefSeq;
97 use URI::Escape qw(uri_unescape);
99 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
101 BEGIN {
102 $MAX_ENTRIES = 19000;
103 $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
104 %CGILOCATION = (
105 'batch' => ['post' => '/entrez/eutils/epost.fcgi'],
106 'query' => ['get' => '/entrez/eutils/efetch.fcgi'],
107 'single' => ['get' => '/entrez/eutils/efetch.fcgi'],
108 'version'=> ['get' => '/entrez/eutils/efetch.fcgi'],
109 'gi' => ['get' => '/entrez/eutils/efetch.fcgi'],
110 'webenv' => ['get' => '/entrez/eutils/efetch.fcgi']
113 %FORMATMAP = ( 'gb' => 'genbank',
114 'gp' => 'genbank',
115 'fasta' => 'fasta',
116 'asn.1' => 'entrezgene',
117 'gbwithparts' => 'genbank',
119 $DEFAULTFORMAT = 'gb';
122 # the new way to make modules a little more lightweight
124 sub new {
125 my ($class, @args ) = @_;
126 my $self = $class->SUPER::new(@args);
127 my ($seq_start,$seq_stop,$no_redirect, $redirect, $complexity,$strand) =
128 $self->_rearrange([qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND)],
129 @args);
130 $seq_start && $self->seq_start($seq_start);
131 $seq_stop && $self->seq_stop($seq_stop);
132 $no_redirect && $self->no_redirect($no_redirect);
133 $redirect && $self->redirect_refseq($redirect);
134 $strand && $self->strand($strand);
135 # adjust statement to accept zero value
136 defined $complexity && ($complexity >=0 && $complexity <=4)
137 && $self->complexity($complexity);
138 return $self;
142 =head2 get_params
144 Title : get_params
145 Usage : my %params = $self->get_params($mode)
146 Function: Returns key,value pairs to be passed to NCBI database
147 for either 'batch' or 'single' sequence retrieval method
148 Returns : a key,value pair hash
149 Args : 'single' or 'batch' mode for retrieval
151 =cut
153 sub get_params {
154 my ($self, $mode) = @_;
155 $self->throw("subclass did not implement get_params");
158 =head2 default_format
160 Title : default_format
161 Usage : my $format = $self->default_format
162 Function: Returns default sequence format for this module
163 Returns : string
164 Args : none
166 =cut
168 sub default_format {
169 return $DEFAULTFORMAT;
172 =head2 get_request
174 Title : get_request
175 Usage : my $url = $self->get_request
176 Function: HTTP::Request
177 Returns :
178 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
180 =cut
182 sub get_request {
183 my ($self, @qualifiers) = @_;
184 my ($mode, $uids, $format, $query, $seq_start, $seq_stop, $strand, $complexity) =
185 $self->_rearrange([qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
186 @qualifiers);
187 $mode = lc $mode;
188 ($format) = $self->request_format() unless ( defined $format);
189 if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
190 my %params = $self->get_params($mode);
191 if( ! %params ) {
192 $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
194 my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
195 unless( $mode eq 'webenv' || defined $uids || defined $query) {
196 $self->throw("Must specify a query or list of uids to fetch");
198 if ($query && $query->can('cookie')) {
199 @params{'WebEnv','query_key'} = $query->cookie;
200 $params{'db'} = $query->db;
202 elsif ($query) {
203 $params{'id'} = join ',',$query->ids;
205 # for batch retrieval, non-query style
206 elsif ($mode eq 'webenv' && $self->can('cookie')) {
207 @params{'WebEnv','query_key'} = $self->cookie;
209 elsif ($uids) {
210 if( ref($uids) =~ /array/i ) {
211 $uids = join(",", @$uids);
213 $params{'id'} = $uids;
215 $seq_start && ($params{'seq_start'} = $seq_start);
216 $seq_stop && ($params{'seq_stop'} = $seq_stop);
217 $strand && ($params{'strand'} = $strand);
218 if (defined $complexity && ($seq_start || $seq_stop || $strand)) {
219 $self->warn("Complexity set to $complexity; seq_start and seq_stop may not work!")
220 if ($complexity != 1 && ($seq_start || $seq_stop));
221 $self->warn("Complexity set to 0; expect strange results with strand set to 2")
222 if ($complexity == 0 && $strand == 2 && $format eq 'fasta');
224 defined $complexity && ($params{'complexity'} = $complexity);
225 $params{'rettype'} = $format unless $mode eq 'batch';
226 # for now, 'post' is batch retrieval
227 if ($CGILOCATION{$mode}[0] eq 'post') {
228 my $response = $self->ua->request(POST $url,[%params]);
229 $response->proxy_authorization_basic($self->authentication)
230 if ( $self->authentication);
231 $self->_parse_response($response->content);
232 my ($cookie, $querykey) = $self->cookie;
233 my %qualifiers = ('-mode' => 'webenv',
234 '-seq_start' => $seq_start,
235 '-seq_stop' => $seq_stop,
236 '-strand' => $strand,
237 '-complexity' => $complexity,
238 '-format' => $format);
239 return $self->get_request(%qualifiers);
240 } else {
241 $url->query_form(%params);
242 return GET $url;
246 =head2 get_Stream_by_batch
248 Title : get_Stream_by_batch
249 Usage : $seq = $db->get_Stream_by_batch($ref);
250 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
251 at a time. For large numbers of sequences, this is far superior
252 than get_Stream_by_[id/acc]().
253 Example :
254 Returns : a Bio::SeqIO stream object
255 Args : $ref : either an array reference, a filename, or a filehandle
256 from which to get the list of unique ids/accession numbers.
258 NOTE: deprecated API. Use get_Stream_by_id() instead.
260 =cut
262 *get_Stream_by_batch = sub {
263 my $self = shift;
264 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
265 $self->get_Stream_by_id(@_)
268 =head2 get_Stream_by_query
270 Title : get_Stream_by_query
271 Usage : $seq = $db->get_Stream_by_query($query);
272 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
273 at a time. For large numbers of sequences, this is far superior
274 than get_Stream_by_[id/acc]().
275 Example :
276 Returns : a Bio::SeqIO stream object
277 Args : $query : An Entrez query string or a
278 Bio::DB::Query::GenBank object. It is suggested that you
279 create a Bio::DB::Query::GenBank object and get the entry
280 count before you fetch a potentially large stream.
282 =cut
284 sub get_Stream_by_query {
285 my ($self, $query) = @_;
286 unless (ref $query && $query->can('query')) {
287 $query = Bio::DB::Query::GenBank->new($query);
289 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
292 =head2 postprocess_data
294 Title : postprocess_data
295 Usage : $self->postprocess_data ( 'type' => 'string',
296 'location' => \$datastr);
297 Function: process downloaded data before loading into a Bio::SeqIO
298 Returns : void
299 Args : hash with two keys - 'type' can be 'string' or 'file'
300 - 'location' either file location or string
301 reference containing data
303 =cut
305 # the default method, works for genbank/genpept, other classes should
306 # override it with their own method.
308 sub postprocess_data {
309 # retain this in case postprocessing is needed at a future date
313 =head2 request_format
315 Title : request_format
316 Usage : my ($req_format, $ioformat) = $self->request_format;
317 $self->request_format("genbank");
318 $self->request_format("fasta");
319 Function: Get/Set sequence format retrieval. The get-form will normally not
320 be used outside of this and derived modules.
321 Returns : Array of two strings, the first representing the format for
322 retrieval, and the second specifying the corresponding SeqIO format.
323 Args : $format = sequence format
325 =cut
327 sub request_format {
328 my ($self, $value) = @_;
329 if( defined $value ) {
330 $value = lc $value;
331 if( defined $FORMATMAP{$value} ) {
332 $self->{'_format'} = [ $value, $FORMATMAP{$value}];
333 } else {
334 # Try to fall back to a default. Alternatively, we could throw
335 # an exception
336 $self->{'_format'} = [ $value, $value ];
339 return @{$self->{'_format'}};
342 =head2 redirect_refseq
344 Title : redirect_refseq
345 Usage : $db->redirect_refseq(1)
346 Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
347 Returns : Boolean value
348 Args : Boolean value (optional)
349 Throws : 'unparseable output exception'
350 Note : This replaces 'no_redirect' as a more straightforward flag to
351 redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
352 instead of retrievign the NCBI records
354 =cut
356 sub redirect_refseq {
357 my $self = shift;
358 return $self->{'_redirect_refseq'} = shift if @_;
359 return $self->{'_redirect_refseq'};
362 =head2 complexity
364 Title : complexity
365 Usage : $db->complexity(3)
366 Function: get/set complexity value
367 Returns : value from 0-4 indicating level of complexity
368 Args : value from 0-4 (optional); if unset server assumes 1
369 Throws : if arg is not an integer or falls outside of noted range above
370 Note : From efetch docs:
372 Complexity regulates the display:
374 * 0 - get the whole blob
375 * 1 - get the bioseq for gi of interest (default in Entrez)
376 * 2 - get the minimal bioseq-set containing the gi of interest
377 * 3 - get the minimal nuc-prot containing the gi of interest
378 * 4 - get the minimal pub-set containing the gi of interest
380 =cut
382 sub complexity {
383 my ($self, $comp) = @_;
384 if (defined $comp) {
385 $self->throw("Complexity value must be integer between 0 and 4") if
386 $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
387 $self->{'_complexity'} = $comp;
389 return $self->{'_complexity'};
392 =head2 strand
394 Title : strand
395 Usage : $db->strand(1)
396 Function: get/set strand value
397 Returns : strand value if set
398 Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
399 Throws : if arg is not an integer or is not 1 or 2
400 Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant.
401 We should probably add in some functionality to convert over in the future.
403 =cut
405 sub strand {
406 my ($self, $str) = @_;
407 if ($str) {
408 $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set") if
409 $str !~ /^\d+$/ || $str < 1 || $str > 2;
410 $self->{'_strand'} = $str;
412 return $self->{'_strand'};
415 =head2 seq_start
417 Title : seq_start
418 Usage : $db->seq_start(123)
419 Function: get/set sequence start location
420 Returns : sequence start value if set
421 Args : integer; if unset server assumes 1
422 Throws : if arg is not an integer
424 =cut
426 sub seq_start {
427 my ($self, $start) = @_;
428 if ($start) {
429 $self->throw("seq_start() must be integer value if set") if
430 $start !~ /^\d+$/;
431 $self->{'_seq_start'} = $start;
433 return $self->{'_seq_start'};
436 =head2 seq_stop
438 Title : seq_stop
439 Usage : $db->seq_stop(456)
440 Function: get/set sequence stop (end) location
441 Returns : sequence stop (end) value if set
442 Args : integer; if unset server assumes 1
443 Throws : if arg is not an integer
445 =cut
447 sub seq_stop {
448 my ($self, $stop) = @_;
449 if ($stop) {
450 $self->throw("seq_stop() must be integer if set") if
451 $stop !~ /^\d+$/;
452 $self->{'_seq_stop'} = $stop;
454 return $self->{'_seq_stop'};
457 =head2 Bio::DB::WebDBSeqI methods
459 Overriding WebDBSeqI method to help newbies to retrieve sequences
461 =head2 get_Stream_by_acc
463 Title : get_Stream_by_acc
464 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
465 Function: Gets a series of Seq objects by accession numbers
466 Returns : a Bio::SeqIO stream object
467 Args : $ref : a reference to an array of accession numbers for
468 the desired sequence entries
469 Note : For GenBank, this just calls the same code for get_Stream_by_id()
471 =cut
473 sub get_Stream_by_acc {
474 my ($self, $ids ) = @_;
475 my $newdb = $self->_check_id($ids);
476 if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
477 return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
478 } else {
479 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
484 =head2 _check_id
486 Title : _check_id
487 Usage :
488 Function:
489 Returns : A Bio::DB::RefSeq reference or throws
490 Args : $id(s), $string
492 =cut
494 sub _check_id {
495 my ($self, $ids) = @_;
497 # NT contigs can not be retrieved
498 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
499 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
500 if $ids =~ /NT_/;
502 # Asking for a RefSeq from EMBL/GenBank
504 if ($self->redirect_refseq) {
505 if ($ids =~ /N._/) {
506 $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
507 " Redirecting the request.\n")
508 if $self->verbose >= 0;
509 return Bio::DB::RefSeq->new();
514 =head2 delay_policy
516 Title : delay_policy
517 Usage : $secs = $self->delay_policy
518 Function: return number of seconds to delay between calls to remote db
519 Returns : number of seconds to delay
520 Args : none
522 NOTE: NCBI requests a delay of 3 seconds between requests. This method
523 implements that policy.
525 =cut
527 sub delay_policy {
528 my $self = shift;
529 return 3;
532 =head2 cookie
534 Title : cookie
535 Usage : ($cookie,$querynum) = $db->cookie
536 Function: return the NCBI query cookie
537 Returns : list of (cookie,querynum)
538 Args : none
540 NOTE: this information is used by Bio::DB::GenBank in
541 conjunction with efetch.
543 =cut
545 # ripped from Bio::DB::Query::GenBank
546 sub cookie {
547 my $self = shift;
548 if (@_) {
549 $self->{'_cookie'} = shift;
550 $self->{'_querynum'} = shift;
552 else {
553 return @{$self}{qw(_cookie _querynum)};
557 =head2 _parse_response
559 Title : _parse_response
560 Usage : $db->_parse_response($content)
561 Function: parse out response for cookie
562 Returns : empty
563 Args : none
564 Throws : 'unparseable output exception'
566 =cut
568 # trimmed-down version of _parse_response from Bio::DB::Query::GenBank
569 sub _parse_response {
570 my $self = shift;
571 my $content = shift;
572 if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
573 $self->warn("Warning(s) from GenBank: $warning\n");
575 if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
576 $self->throw("Error from Genbank: $error");
578 my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
579 my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
580 $self->cookie(uri_unescape($cookie),$querykey);
583 ########### DEPRECATED!!!! ###########
585 =head2 no_redirect
587 Title : no_redirect
588 Usage : $db->no_redirect($content)
589 Function: Used to indicate that Bio::DB::GenBank instance retrieves
590 possible RefSeqs from EBI instead; default behavior is now to
591 retrieve directly from NCBI
592 Returns : None
593 Args : None
594 Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
596 =cut
598 sub no_redirect {
599 shift->throw(
600 "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n".
601 "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n".
602 "redirect_refseq flag to 1");
607 __END__