small update
[bioperl-live.git] / Bio / DB / GenBank.pm
blob52910d050e6b70f46f07527d5e92297d1ead95a3
1 # $Id$
3 # BioPerl module for Bio::DB::GenBank
5 # Cared for by Aaron Mackey <amackey@virginia.edu>
7 # Copyright Aaron Mackey
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 # Added LWP support - Jason Stajich 2000-11-6
14 # completely reworked by Jason Stajich 2000-12-8
15 # to use WebDBSeqI
17 # Added batch entrez back when determined that new entrez cgi will
18 # essentially work (there is a limit to the number of characters in a
19 # GET request so I am not sure how we can get around this). The NCBI
20 # Batch Entrez form has changed some and it does not support retrieval
21 # of text only data. Still should investigate POST-ing (tried and
22 # failed) a message to the entrez cgi to get around the GET
23 # limitations.
25 =head1 NAME
27 Bio::DB::GenBank - Database object interface to GenBank
29 =head1 SYNOPSIS
31 use Bio::DB::GenBank;
32 $gb = Bio::DB::GenBank->new();
34 $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID
36 # or ...
38 $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number
39 $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version
40 $seq = $gb->get_Seq_by_gi('405830'); # GI Number
42 # get a stream via a query string
43 my $query = Bio::DB::Query::GenBank->new
44 (-query =>'Oryza sativa[Organism] AND EST',
45 -reldate => '30',
46 -db => 'nucleotide');
47 my $seqio = $gb->get_Stream_by_query($query);
49 while( my $seq = $seqio->next_seq ) {
50 print "seq length is ", $seq->length,"\n";
53 # or ... best when downloading very large files, prevents
54 # keeping all of the file in memory
56 # also don't want features, just sequence so let's save bandwidth
57 # and request Fasta sequence
58 $gb = Bio::DB::GenBank->new(-retrievaltype => 'tempfile' ,
59 -format => 'Fasta');
60 my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] );
61 while( my $clone = $seqio->next_seq ) {
62 print "cloneid is ", $clone->display_id, " ",
63 $clone->accession_number, "\n";
65 # note that get_Stream_by_version is not implemented
67 # don't want the entire sequence or more options
68 my $gb = Bio::DB::GenBank->new(-format => 'Fasta',
69 -seq_start => 100,
70 -seq_stop => 200,
71 -strand => 1,
72 -complexity => 4);
73 my $seqi = $gb->get_Stream_by_query($query);
76 =head1 DESCRIPTION
78 Allows the dynamic retrieval of L<Bio::Seq> sequence objects from the
79 GenBank database at NCBI, via an Entrez query.
81 WARNING: Please do B<NOT> spam the Entrez web server with multiple
82 requests. NCBI offers Batch Entrez for this purpose.
84 Note that when querying for GenBank accessions starting with 'NT_' you
85 will need to call $gb-E<gt>request_format('fasta') beforehand, because
86 in GenBank format (the default) the sequence part will be left out
87 (the reason is that NT contigs are rather annotation with references
88 to clones).
90 Some work has been done to automatically detect and retrieve whole NT_
91 clones when the data is in that format (NCBI RefSeq clones). The
92 former behavior prior to bioperl 1.6 was to retrieve these from EBI,
93 but now these are retrieved directly from NCBI. The older behavior can
94 be regained by setting the 'redirect_refseq' flag to a value
95 evaluating to TRUE.
97 =head2 Running
99 Alternate methods are described at
100 L<http://www.ncbi.nlm.nih.gov/entrez/query/static/efetchseq_help.html>
102 NOTE: strand should be 1 for plus or 2 for minus.
104 Complexity: gi is often a part of a biological blob, containing other gis.
107 complexity regulates the display:
108 0 - get the whole blob
109 1 - get the bioseq for gi of interest (default in Entrez)
110 2 - get the minimal bioseq-set containing the gi of interest
111 3 - get the minimal nuc-prot containing the gi of interest
112 4 - get the minimal pub-set containing the gi of interest
114 'seq_start' and 'seq_stop' will not work when setting complexity to
115 any value other than 1. 'strand' works for any setting other than a
116 complexity of 0 (whole blob); when you try this with a GenBank return
117 format nothing happens, whereas using FASTA works but causes display
118 problems with the other sequences in the blob. As Tao Tao says from
119 NCBI, "Better left it out or set it to 1."
121 =head1 FEEDBACK
123 =head2 Mailing Lists
125 User feedback is an integral part of the evolution of this and other
126 Bioperl modules. Send your comments and suggestions preferably to one
127 of the Bioperl mailing lists. Your participation is much appreciated.
129 bioperl-l@bioperl.org - General discussion
130 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
132 =head2 Reporting Bugs
134 Report bugs to the Bioperl bug tracking system to help us keep track
135 of the bugs and their resolution. Bug reports can be submitted via the
136 web:
138 http://bugzilla.open-bio.org/
140 =head1 AUTHOR - Aaron Mackey, Jason Stajich
142 Email amackey@virginia.edu
143 Email jason@bioperl.org
145 =head1 APPENDIX
147 The rest of the documentation details each of the
148 object methods. Internal methods are usually
149 preceded with a _
151 =cut
153 # Let the code begin...
155 package Bio::DB::GenBank;
156 use strict;
157 use vars qw(%PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE);
159 use base qw(Bio::DB::NCBIHelper);
160 BEGIN {
161 $DEFAULTMODE = 'single';
162 $DEFAULTFORMAT = 'gbwithparts';
163 %PARAMSTRING = (
164 'batch' => { 'db' => 'nucleotide',
165 'usehistory' => 'n',
166 'tool' => 'bioperl'},
167 'query' => { 'usehistory' => 'y',
168 'tool' => 'bioperl',
169 'retmode' => 'text'},
170 'gi' => { 'db' => 'nucleotide',
171 'usehistory' => 'n',
172 'tool' => 'bioperl',
173 'retmode' => 'text'},
174 'version' => { 'db' => 'nucleotide',
175 'usehistory' => 'n',
176 'tool' => 'bioperl',
177 'retmode' => 'text'},
178 'single' => { 'db' => 'nucleotide',
179 'usehistory' => 'n',
180 'tool' => 'bioperl',
181 'retmode' => 'text'},
182 'webenv' => {
183 'query_key' => 'querykey',
184 'WebEnv' => 'cookie',
185 'db' => 'nucleotide',
186 'usehistory' => 'n',
187 'tool' => 'bioperl',
188 'retmode' => 'text'},
192 # new is in NCBIHelper
194 # helper method to get db specific options
196 =head2 new
198 Title : new
199 Usage : $gb = Bio::DB::GenBank->new(@options)
200 Function: Creates a new genbank handle
201 Returns : New genbank handle
202 Args : -delay number of seconds to delay between fetches (3s)
204 NOTE: There are other options that are used internally. By NCBI policy, this
205 module introduces a 3s delay between fetches. If you are fetching multiple genbank
206 ids, it is a good idea to use get_Stream_by_id() with a reference to an array of ids rather than fetching them one at a time.
208 =cut
210 =head2 get_params
212 Title : get_params
213 Usage : my %params = $self->get_params($mode)
214 Function: Returns key,value pairs to be passed to NCBI database
215 for either 'batch' or 'single' sequence retrieval method
216 Returns : a key,value pair hash
217 Args : 'single' or 'batch' mode for retrieval
219 =cut
221 sub get_params {
222 my ($self, $mode) = @_;
223 return defined $PARAMSTRING{$mode} ?
224 %{$PARAMSTRING{$mode}} : %{$PARAMSTRING{$DEFAULTMODE}};
227 # from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
229 =head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
231 =head2 get_Seq_by_id
233 Title : get_Seq_by_id
234 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
235 Function: Gets a Bio::Seq object by its name
236 Returns : a Bio::Seq object
237 Args : the id (as a string) of a sequence
238 Throws : "id does not exist" exception
240 =head2 get_Seq_by_acc
242 Title : get_Seq_by_acc
243 Usage : $seq = $db->get_Seq_by_acc($acc);
244 Function: Gets a Seq object by accession numbers
245 Returns : a Bio::Seq object
246 Args : the accession number as a string
247 Note : For GenBank, this just calls the same code for get_Seq_by_id()
248 Throws : "id does not exist" exception
250 =head2 get_Seq_by_gi
252 Title : get_Seq_by_gi
253 Usage : $seq = $db->get_Seq_by_gi('405830');
254 Function: Gets a Bio::Seq object by gi number
255 Returns : A Bio::Seq object
256 Args : gi number (as a string)
257 Throws : "gi does not exist" exception
259 =head2 get_Seq_by_version
261 Title : get_Seq_by_version
262 Usage : $seq = $db->get_Seq_by_version('X77802.1');
263 Function: Gets a Bio::Seq object by sequence version
264 Returns : A Bio::Seq object
265 Args : accession.version (as a string)
266 Throws : "acc.version does not exist" exception
268 =head1 Routines implemented by Bio::DB::NCBIHelper
270 =head2 get_Stream_by_query
272 Title : get_Stream_by_query
273 Usage : $seq = $db->get_Stream_by_query($query);
274 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
275 at a time. For large numbers of sequences, this is far superior
276 to get_Stream_by_[id/acc]().
277 Example :
278 Returns : a Bio::SeqIO stream object
279 Args : $query : An Entrez query string or a
280 Bio::DB::Query::GenBank object. It is suggested that you
281 create a Bio::DB::Query::GenBank object and get the entry
282 count before you fetch a potentially large stream.
284 =cut
286 =head2 get_Stream_by_id
288 Title : get_Stream_by_id
289 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
290 Function: Gets a series of Seq objects by unique identifiers
291 Returns : a Bio::SeqIO stream object
292 Args : $ref : a reference to an array of unique identifiers for
293 the desired sequence entries
295 =head2 get_Stream_by_acc
297 Title : get_Stream_by_acc
298 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
299 Function: Gets a series of Seq objects by accession numbers
300 Returns : a Bio::SeqIO stream object
301 Args : $ref : a reference to an array of accession numbers for
302 the desired sequence entries
303 Note : For GenBank, this just calls the same code for get_Stream_by_id()
305 =cut
307 =head2 get_Stream_by_gi
309 Title : get_Stream_by_gi
310 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]);
311 Function: Gets a series of Seq objects by gi numbers
312 Returns : a Bio::SeqIO stream object
313 Args : $ref : a reference to an array of gi numbers for
314 the desired sequence entries
315 Note : For GenBank, this just calls the same code for get_Stream_by_id()
317 =head2 get_Stream_by_batch
319 Title : get_Stream_by_batch
320 Usage : $seq = $db->get_Stream_by_batch($ref);
321 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
322 at a time.
323 Example :
324 Returns : a Bio::SeqIO stream object
325 Args : $ref : either an array reference, a filename, or a filehandle
326 from which to get the list of unique ids/accession numbers.
328 NOTE: This method is redundant and deprecated. Use get_Stream_by_id()
329 instead.
331 =head2 get_request
333 Title : get_request
334 Usage : my $url = $self->get_request
335 Function: Builds the request for the given qualifiers
336 Returns : HTTP::Request
337 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
339 =cut
341 =head2 default_format
343 Title : default_format
344 Usage : my $format = $self->default_format
345 Function: Returns default sequence format for this module
346 Returns : string
347 Args : none
349 =cut
351 sub default_format {
352 return $DEFAULTFORMAT;
356 __END__