sync w/ main trunk
[bioperl-live.git] / Bio / DB / GenBank.pm
blob73fe0bc77842f58430f3e60857011ceae0bf9eea
1 # $Id$
3 # BioPerl module for Bio::DB::GenBank
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Aaron Mackey <amackey@virginia.edu>
9 # Copyright Aaron Mackey
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 # Added LWP support - Jason Stajich 2000-11-6
16 # completely reworked by Jason Stajich 2000-12-8
17 # to use WebDBSeqI
19 # Added batch entrez back when determined that new entrez cgi will
20 # essentially work (there is a limit to the number of characters in a
21 # GET request so I am not sure how we can get around this). The NCBI
22 # Batch Entrez form has changed some and it does not support retrieval
23 # of text only data. Still should investigate POST-ing (tried and
24 # failed) a message to the entrez cgi to get around the GET
25 # limitations.
27 =head1 NAME
29 Bio::DB::GenBank - Database object interface to GenBank
31 =head1 SYNOPSIS
33 use Bio::DB::GenBank;
34 $gb = Bio::DB::GenBank->new();
36 $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID
38 # or ...
40 $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number
41 $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version
42 $seq = $gb->get_Seq_by_gi('405830'); # GI Number
44 # get a stream via a query string
45 my $query = Bio::DB::Query::GenBank->new
46 (-query =>'Oryza sativa[Organism] AND EST',
47 -reldate => '30',
48 -db => 'nucleotide');
49 my $seqio = $gb->get_Stream_by_query($query);
51 while( my $seq = $seqio->next_seq ) {
52 print "seq length is ", $seq->length,"\n";
53 }
55 # or ... best when downloading very large files, prevents
56 # keeping all of the file in memory
58 # also don't want features, just sequence so let's save bandwidth
59 # and request Fasta sequence
60 $gb = Bio::DB::GenBank->new(-retrievaltype => 'tempfile' ,
61 -format => 'Fasta');
62 my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] );
63 while( my $clone = $seqio->next_seq ) {
64 print "cloneid is ", $clone->display_id, " ",
65 $clone->accession_number, "\n";
66 }
67 # note that get_Stream_by_version is not implemented
69 # don't want the entire sequence or more options
70 my $gb = Bio::DB::GenBank->new(-format => 'Fasta',
71 -seq_start => 100,
72 -seq_stop => 200,
73 -strand => 1,
74 -complexity => 4);
75 my $seqi = $gb->get_Stream_by_query($query);
78 =head1 DESCRIPTION
80 Allows the dynamic retrieval of L<Bio::Seq> sequence objects from the
81 GenBank database at NCBI, via an Entrez query.
83 WARNING: Please do B<NOT> spam the Entrez web server with multiple
84 requests. NCBI offers Batch Entrez for this purpose.
86 Note that when querying for GenBank accessions starting with 'NT_' you
87 will need to call $gb-E<gt>request_format('fasta') beforehand, because
88 in GenBank format (the default) the sequence part will be left out
89 (the reason is that NT contigs are rather annotation with references
90 to clones).
92 Some work has been done to automatically detect and retrieve whole NT_
93 clones when the data is in that format (NCBI RefSeq clones). The
94 former behavior prior to bioperl 1.6 was to retrieve these from EBI,
95 but now these are retrieved directly from NCBI. The older behavior can
96 be regained by setting the 'redirect_refseq' flag to a value
97 evaluating to TRUE.
99 =head2 Running
101 Alternate methods are described at
102 L<http://www.ncbi.nlm.nih.gov/entrez/query/static/efetchseq_help.html>
104 NOTE: strand should be 1 for plus or 2 for minus.
106 Complexity: gi is often a part of a biological blob, containing other
107 gis.
109 complexity regulates the display:
110 0 - get the whole blob
111 1 - get the bioseq for gi of interest (default in Entrez)
112 2 - get the minimal bioseq-set containing the gi of interest
113 3 - get the minimal nuc-prot containing the gi of interest
114 4 - get the minimal pub-set containing the gi of interest
116 'seq_start' and 'seq_stop' will not work when setting complexity to
117 any value other than 1. 'strand' works for any setting other than a
118 complexity of 0 (whole blob); when you try this with a GenBank return
119 format nothing happens, whereas using FASTA works but causes display
120 problems with the other sequences in the blob. As Tao Tao from NCBI
121 says, "Better left it out or set it to 1."
123 =head1 FEEDBACK
125 =head2 Mailing Lists
127 User feedback is an integral part of the evolution of this and other
128 Bioperl modules. Send your comments and suggestions preferably to one
129 of the Bioperl mailing lists. Your participation is much appreciated.
131 bioperl-l@bioperl.org - General discussion
132 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
134 =head2 Support
136 Please direct usage questions or support issues to the mailing list:
138 L<bioperl-l@bioperl.org>
140 rather than to the module maintainer directly. Many experienced and
141 responsive experts will be able to look at the problem and quickly
142 address it. Please include a thorough description of the problem
143 with code and data examples if at all possible.
145 =head2 Reporting Bugs
147 Report bugs to the Bioperl bug tracking system to help us keep track
148 of the bugs and their resolution. Bug reports can be submitted via the
149 web:
151 http://bugzilla.open-bio.org/
153 =head1 AUTHOR - Aaron Mackey, Jason Stajich
155 Email amackey@virginia.edu
156 Email jason@bioperl.org
158 =head1 APPENDIX
160 The rest of the documentation details each of the
161 object methods. Internal methods are usually
162 preceded with a _
164 =cut
166 # Let the code begin...
168 package Bio::DB::GenBank;
169 use strict;
170 use vars qw(%PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE);
172 use base qw(Bio::DB::NCBIHelper);
# Package-wide defaults, initialised at compile time.
BEGIN {
    # Mode whose parameters get_params() falls back to.
    $DEFAULTMODE   = 'single';

    # Format name handed back by default_format().
    $DEFAULTFORMAT = 'gbwithparts';

    # Request parameters, keyed by retrieval mode.  Note that 'batch' does
    # not set 'retmode' and 'query' does not set 'db'.
    %PARAMSTRING = (
        'batch'   => { 'db'         => 'nucleotide',
                       'usehistory' => 'n',
                       'tool'       => 'bioperl' },
        'query'   => { 'usehistory' => 'y',
                       'tool'       => 'bioperl',
                       'retmode'    => 'text' },
        'gi'      => { 'db'         => 'nucleotide',
                       'usehistory' => 'n',
                       'tool'       => 'bioperl',
                       'retmode'    => 'text' },
        'version' => { 'db'         => 'nucleotide',
                       'usehistory' => 'n',
                       'tool'       => 'bioperl',
                       'retmode'    => 'text' },
        'single'  => { 'db'         => 'nucleotide',
                       'usehistory' => 'n',
                       'tool'       => 'bioperl',
                       'retmode'    => 'text' },
        'webenv'  => { 'query_key'  => 'querykey',
                       'WebEnv'     => 'cookie',
                       'db'         => 'nucleotide',
                       'usehistory' => 'n',
                       'tool'       => 'bioperl',
                       'retmode'    => 'text' },
    );
}
205 # new is in NCBIHelper
207 # helper method to get db specific options
209 =head2 new
211 Title : new
212 Usage : $gb = Bio::DB::GenBank->new(@options)
213 Function: Creates a new genbank handle
214 Returns : New genbank handle
215 Args : -delay number of seconds to delay between fetches (3s)
217 NOTE: There are other options that are used internally. By NCBI policy, this
218 module introduces a 3s delay between fetches. If you are fetching multiple genbank
219 ids, it is a good idea to use get_Stream_by_id() rather than fetching them one at a time.
221 =cut
223 =head2 get_params
225 Title : get_params
226 Usage : my %params = $self->get_params($mode)
227 Function: Returns key,value pairs to be passed to NCBI database
228 for the requested retrieval mode ('single' is used if unknown)
229 Returns : a key,value pair hash
230 Args : 'batch', 'query', 'gi', 'version', 'single' or 'webenv'
232 =cut
# Look up the request parameters for a retrieval mode.
#   $mode - a key of %PARAMSTRING; when it is undefined or not a known
#           mode, the $DEFAULTMODE entry is used instead.
# Returns the key/value pairs of the selected entry as a flat list, so the
# caller receives its own copy rather than a reference into %PARAMSTRING.
sub get_params {
    my ($self, $mode) = @_;

    # Check definedness first so undef is never used as a hash key, and use
    # exists so the lookup is an explicit membership test.
    my $key = ( defined $mode && exists $PARAMSTRING{$mode} )
            ? $mode
            : $DEFAULTMODE;

    return %{ $PARAMSTRING{$key} };
}
240 # from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
242 =head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
244 =head2 get_Seq_by_id
246 Title : get_Seq_by_id
247 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
248 Function: Gets a Bio::Seq object by its name
249 Returns : a Bio::Seq object
250 Args : the id (as a string) of a sequence
251 Throws : "id does not exist" exception
253 =head2 get_Seq_by_acc
255 Title : get_Seq_by_acc
256 Usage : $seq = $db->get_Seq_by_acc($acc);
257 Function: Gets a Seq object by accession numbers
258 Returns : a Bio::Seq object
259 Args : the accession number as a string
260 Note : For GenBank, this just calls the same code for get_Seq_by_id()
261 Throws : "id does not exist" exception
263 =head2 get_Seq_by_gi
265 Title : get_Seq_by_gi
266 Usage : $seq = $db->get_Seq_by_gi('405830');
267 Function: Gets a Bio::Seq object by gi number
268 Returns : A Bio::Seq object
269 Args : gi number (as a string)
270 Throws : "gi does not exist" exception
272 =head2 get_Seq_by_version
274 Title : get_Seq_by_version
275 Usage : $seq = $db->get_Seq_by_version('X77802.1');
276 Function: Gets a Bio::Seq object by sequence version
277 Returns : A Bio::Seq object
278 Args : accession.version (as a string)
279 Throws : "acc.version does not exist" exception
281 =head1 Routines implemented by Bio::DB::NCBIHelper
283 =head2 get_Stream_by_query
285 Title : get_Stream_by_query
286 Usage : $seq = $db->get_Stream_by_query($query);
287 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
288 at a time. For large numbers of sequences, this is far superior
289 to get_Stream_by_[id/acc]().
290 Example :
291 Returns : a Bio::SeqIO stream object
292 Args : $query : An Entrez query string or a
293 Bio::DB::Query::GenBank object. It is suggested that you
294 create a Bio::DB::Query::GenBank object and get the entry
295 count before you fetch a potentially large stream.
297 =cut
299 =head2 get_Stream_by_id
301 Title : get_Stream_by_id
302 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
303 Function: Gets a series of Seq objects by unique identifiers
304 Returns : a Bio::SeqIO stream object
305 Args : $ref : a reference to an array of unique identifiers for
306 the desired sequence entries
308 =head2 get_Stream_by_acc
310 Title : get_Stream_by_acc
311 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
312 Function: Gets a series of Seq objects by accession numbers
313 Returns : a Bio::SeqIO stream object
314 Args : $ref : a reference to an array of accession numbers for
315 the desired sequence entries
316 Note : For GenBank, this just calls the same code for get_Stream_by_id()
318 =cut
320 =head2 get_Stream_by_gi
322 Title : get_Stream_by_gi
323 Usage : $stream = $db->get_Stream_by_gi([$gi1, $gi2]);
324 Function: Gets a series of Seq objects by gi numbers
325 Returns : a Bio::SeqIO stream object
326 Args : $ref : a reference to an array of gi numbers for
327 the desired sequence entries
328 Note : For GenBank, this just calls the same code for get_Stream_by_id()
330 =head2 get_Stream_by_batch
332 Title : get_Stream_by_batch
333 Usage : $seq = $db->get_Stream_by_batch($ref);
334 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
335 at a time.
336 Example :
337 Returns : a Bio::SeqIO stream object
338 Args : $ref : either an array reference, a filename, or a filehandle
339 from which to get the list of unique ids/accession numbers.
341 NOTE: This method is redundant and deprecated. Use get_Stream_by_id()
342 instead.
344 =head2 get_request
346 Title : get_request
347 Usage : my $url = $self->get_request
348 Function: HTTP::Request
349 Returns :
350 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
352 =cut
354 =head2 default_format
356 Title : default_format
357 Usage : my $format = $self->default_format
358 Function: Returns default sequence format for this module
359 Returns : string
360 Args : none
362 =cut
# Return the module-wide default sequence format ($DEFAULTFORMAT).
# Takes no arguments beyond the invocant, which is not used.
sub default_format {
    return $DEFAULTFORMAT;
}

# A module must end with a true value so that require/use succeeds.
1;
369 __END__