small update
[bioperl-live.git] / Bio / DB / SwissProt.pm
blob9dcdabf48b66fd3b1872041481c0ef62231db256
2 # $Id$
4 # BioPerl module for Bio::DB::SwissProt
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
13 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
15 =head1 NAME
17 Bio::DB::SwissProt - Database object interface to SwissProt retrieval
19 =head1 SYNOPSIS
21 use Bio::DB::SwissProt;
23 $sp = Bio::DB::SwissProt->new();
25 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
26 # <4-letter-identifier>_<species 5-letter code>
27 # or ...
28 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
29 # [OPQ]xxxxx
32 # In fact in this implementation
33 # these methods call the same webscript so you can use
34 # them interchangeably
36 # choose a different server to query
37 $sp = Bio::DB::SwissProt->new('-servertype' => 'expasy',
38 '-hostlocation' => 'us');
40 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
42 =head1 DESCRIPTION
44 SwissProt is a curated database of proteins managed by the Swiss
45 Bioinformatics Institute. Additional tools for
46 parsing and manipulating swissprot files can be found at
47 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
49 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
50 SwissProt database via an EBI (dbfetch) or Expasy retrieval.
52 In order to make changes transparent we have server type ('ebi', the
53 default, or 'expasy') and host location (defaulting to 'uk' for ebi and
54 'us' for expasy) separated out. This allows the user to pick the
55 closest Expasy mirror for running their queries.
58 =head1 FEEDBACK
60 =head2 Mailing Lists
62 User feedback is an integral part of the evolution of this and other
63 Bioperl modules. Send your comments and suggestions preferably to one
64 of the Bioperl mailing lists. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70 =head2 Reporting Bugs
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 of the bugs and their resolution. Bug reports can be submitted via the
74 web:
76 http://bugzilla.open-bio.org/
78 =head1 AUTHOR - Jason Stajich
80 Email Jason Stajich E<lt>jason@bioperl.orgE<gt>
82 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss
83 Institute of Bioinformatics for helping point us in the direction of
84 the correct expasy scripts and for swissknife references.
86 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt>
87 for help with adding EBI swall server.
89 =head1 APPENDIX
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
94 =cut
96 # Let the code begin...
98 package Bio::DB::SwissProt;
99 use strict;
100 use vars qw($MODVERSION %HOSTS $DEFAULTFORMAT $DEFAULTSERVERTYPE);
102 $MODVERSION = '0.8.1';
103 use HTTP::Request::Common;
105 use base qw(Bio::DB::WebDBSeqI);
107 # global vars
108 $DEFAULTSERVERTYPE = 'ebi';
109 $DEFAULTFORMAT = 'swissprot';
111 # you can add your own here theoretically.
# Per-server-type configuration, keyed by server type name.
#   default  - host location selected when a server type is set
#   baseurl  - sprintf template; %s is replaced by the chosen host name
#   hosts    - host location name => host name
#   jointype - separator used to join several ids into one query value
#   idvar    - name of the form variable that carries the id(s)
#   basevars - extra form variables sent with every request
%HOSTS = (
    'expasy' => {
        'default' => 'us',
        'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
        'hosts'   => {
            'switzerland' => 'ch.expasy.org',
            'canada'      => 'ca.expasy.org',
            'china'       => 'cn.expasy.org',
            'taiwan'      => 'tw.expasy.org',
            'australia'   => 'au.expasy.org',
            'korea'       => 'kr.expasy.org',
            'us'          => 'us.expasy.org',
        },

        # ick, CGI variables
        'jointype' => ' ',
        'idvar'    => 'list',
        'basevars' => [],
    },
    'ebi' => {
        'default' => 'uk',
        'baseurl' => 'http://%s/cgi-bin/dbfetch',
        'hosts'   => {
            'uk' => 'www.ebi.ac.uk',
        },
        'jointype' => ',',
        'idvar'    => 'id',
        'basevars' => [
            'db'    => 'UniProtKB',
            'style' => 'raw',
        ],
    },
);
144 # new modules should be a little more lightweight and
145 # should use Bio::Root::Root
# Constructor.
# Recognised named arguments (picked out of @args by _rearrange):
#   FORMAT       - requested sequence format; anything not matching
#                  swiss/fasta triggers a warning and is replaced by
#                  default_format()
#   HOSTLOCATION - host location name for the chosen server type
#   SERVERTYPE   - server type; $DEFAULTSERVERTYPE when not given
# All arguments are also forwarded unchanged to the parent constructor.
sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    my ($format, $hostlocation, $servertype) =
        $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)], @args);

    my $unsupported = $format && $format !~ /(swiss)|(fasta)/i;
    if ($unsupported) {
        $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
        $format = $self->default_format;
    }

    # The server type must be in place before the host location, because
    # setting the server type resets the host location to that server's
    # default.
    $self->servertype( lc( $servertype || $DEFAULTSERVERTYPE ) );
    $self->hostlocation( lc $hostlocation ) if $hostlocation;

    # Re-apply the format now that the final server type is known. When
    # $format is undefined, request_format() only reads the current value.
    $self->request_format($format);

    return $self;
}
169 =head2 Routines from Bio::DB::RandomAccessI
171 =cut
173 =head2 get_Seq_by_id
175 Title : get_Seq_by_id
176 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
177 Function: Gets a Bio::Seq object by its name
178 Returns : a Bio::Seq object
179 Args : the id (as a string) of a sequence
180 Throws : "id does not exist" exception
182 =cut
184 =head2 get_Seq_by_acc
186 Title : get_Seq_by_acc
187 Usage : $seq = $db->get_Seq_by_acc('X77802');
188 Function: Gets a Bio::Seq object by accession number
189 Returns : A Bio::Seq object
190 Args : accession number (as a string)
191 Throws : "acc does not exist" exception
193 =cut
195 =head2 get_Stream_by_id
197 Title : get_Stream_by_id
198 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
199 Function: Gets a series of Seq objects by unique identifiers
200 Returns : a Bio::SeqIO stream object
201 Args : $ref : a reference to an array of unique identifiers for
202 the desired sequence entries
204 =cut
206 =head2 get_Stream_by_acc
208 Title : get_Stream_by_acc
209 Usage : $stream = $db->get_Stream_by_acc([$acc1, $acc2]);
210 Function: Gets a series of Seq objects by accession numbers
211 Returns : a Bio::SeqIO stream object
212 Args : $ref : a reference to an array of accession numbers for
213 the desired sequence entries
214 Note : For GenBank, this just calls the same code for get_Stream_by_id()
216 =cut
218 =head2 get_Stream_by_batch
220 Title : get_Stream_by_batch
221 Usage : $stream = $db->get_Stream_by_batch($ref);
222 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
223 at a time. This is implemented the same way as get_Stream_by_id,
224 but is provided here in keeping with access methods of NCBI
225 modules.
226 Example :
227 Returns : a Bio::SeqIO stream object
228 Args : $ref : either an array reference, a filename, or a filehandle
229 from which to get the list of unique ids/accession numbers.
231 NOTE: deprecated API. Use get_Stream_by_id() instead.
233 =cut
# Deprecated alias: emits a deprecation notice, then hands every argument
# on to get_Stream_by_id() and returns whatever that method returns.
*get_Stream_by_batch = sub {
    my ($self, @args) = @_;
    $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
    return $self->get_Stream_by_id(@args);
};
241 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
243 =cut
245 =head2 get_request
247 Title : get_request
248 Usage : my $request = $self->get_request(-uids => $uids)
249 Function: builds the HTTP request used to retrieve the given ids
250 Returns : a HTTP::Request object
251 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
253 =cut
# Build the HTTP POST request for a set of ids.
# Named arguments (picked out by _rearrange):
#   UIDS   - a single id, or an array reference of ids (required)
#   FORMAT - optional format; handed to request_format(), which stores it
#            on the object when defined
# Returns the value produced by POST() for the server URL and form fields.
# Throws when UIDS is not defined.
sub get_request {
    my ($self, @qualifiers) = @_;
    my ($uids, $format) =
        $self->_rearrange([qw(UIDS FORMAT)], @qualifiers);

    $self->throw("Must specify a value for uids to query")
        unless defined $uids;

    # Only the retrieval-side format name goes into the request.
    my ($retrieval_format) = $self->request_format($format);

    my $server = $HOSTS{ $self->servertype };
    my %form   = (
        @{ $server->{'basevars'} },
        'format' => $retrieval_format,
    );

    my $url = $self->location_url;

    my $separator = $server->{'jointype'} || ' ';
    my $id_field  = $server->{'idvar'}    || 'id';

    # HTTP::Request automagically converts the ' ' to %20
    $form{$id_field} = ref($uids) =~ /ARRAY/i
        ? join( $separator, @$uids )
        : $uids;

    return POST( $url, \%form );
}
287 =head2 postprocess_data
289 Title : postprocess_data
290 Usage : $self->postprocess_data ( 'type' => 'string',
291 'location' => \$datastr);
292 Function: process downloaded data before loading into a Bio::SeqIO
293 Returns : void
294 Args : hash with two keys - 'type' can be 'string' or 'file'
295 - 'location' either file location or string
296 reference containing data
298 =cut
# Hook invoked on downloaded data; this module needs no post-processing,
# so the arguments are accepted and ignored.
# Returns nothing (empty list / undef depending on context).
sub postprocess_data {
    my $self = shift;
    my %args = @_;
    return;
}
307 =head2 default_format
309 Title : default_format
310 Usage : my $format = $self->default_format
311 Function: Returns default sequence format for this module
312 Returns : string
313 Args : none
315 =cut
# Returns the module-wide default sequence format ($DEFAULTFORMAT).
# Takes no arguments besides the invocant, which is not consulted.
sub default_format {
    my $format = $DEFAULTFORMAT;
    return $format;
}
321 =head2 Bio::DB::SwissProt specific routines
323 =cut
325 =head2 servertype
327 Title : servertype
328 Usage : my $servertype = $self->servertype
329 $self->servertype($servertype);
330 Function: Get/Set server type
331 Returns : string
332 Args : server type string [optional]
334 =cut
# Get/set the server type.
# Args   : optional server type name; must be a key of %HOSTS.
#          An undefined or empty value makes this a plain getter.
# Returns: the stored server type, or $DEFAULTSERVERTYPE when none is set.
# Throws : when the given name is not a key of %HOSTS; the message lists
#          the available server type names.
# Setting a server type also resets the host location to that server's
# default and re-applies the current request format.
sub servertype {
    my ($self, $servertype) = @_;
    if ( defined $servertype && $servertype ne '' ) {
        unless ( $HOSTS{$servertype} ) {
            # join() is required here: a bare "keys %HOSTS" in a string
            # concatenation is evaluated in scalar context and yields only
            # the number of keys.
            $self->throw( "You gave an invalid server type ($servertype)"
                  . " - available types are "
                  . join( ", ", sort keys %HOSTS ) );
        }
        $self->{'_servertype'}   = $servertype;
        $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};

        # make sure format is reset properly in that different
        # servers have different syntaxes
        my ($existingformat) = $self->request_format;
        $self->request_format($existingformat);
    }
    return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
}
354 =head2 hostlocation
356 Title : hostlocation
357 Usage : my $location = $self->hostlocation()
358 $self->hostlocation($location)
359 Function: Set/Get Hostlocation
360 Returns : string representing hostlocation
361 Args : string specifying hostlocation [optional]
363 =cut
# Get/set the host location for the current server type.
# Args   : optional host location name (case-insensitive); must be a key
#          of the current server type's 'hosts' table in %HOSTS.
#          An undefined or empty value makes this a plain getter.
# Returns: the stored host location.
# Throws : when no server type is available, or when the given location
#          is not known for the current server type.
sub hostlocation {
    my ($self, $location) = @_;

    my $servertype = $self->servertype;
    $self->throw("Must have a valid servertype defined")
        unless defined $servertype;

    if ( defined $location && $location ne '' ) {
        # Lower-case only once we know there is a value; doing it up front
        # would turn undef into '' and emit an "uninitialized" warning.
        $location = lc $location;

        # The host table is only needed when validating a new value.
        my $hosts = $HOSTS{$servertype}->{'hosts'} || {};
        if ( !$hosts->{$location} ) {
            $self->throw( "Must specify a known host, not $location,"
                  . " possible values ("
                  . join( ",", sort keys %$hosts ) . ")" );
        }
        $self->{'_hostlocation'} = $location;
    }
    return $self->{'_hostlocation'};
}
383 =head2 location_url
385 Title : location_url
386 Usage : my $url = $self->location_url()
387 Function: Get host url
388 Returns : string representing url
389 Args : none
391 =cut
# Build the retrieval URL for the current server type and host location
# by filling the server's 'baseurl' template with the selected host name.
# Returns: the URL string.
# Throws : when either the server type or the host location is undefined.
sub location_url {
    my $self = shift;
    my $servertype = $self->servertype();
    my $location   = $self->hostlocation();

    unless ( defined $location && defined $servertype ) {
        $self->throw("must have a valid hostlocation and servertype set before calling location_url");
    }

    my $server = $HOSTS{$servertype};
    return sprintf( $server->{'baseurl'}, $server->{'hosts'}->{$location} );
}
405 =head2 request_format
407 Title : request_format
408 Usage : my ($req_format, $ioformat) = $self->request_format;
409 $self->request_format("swiss");
410 $self->request_format("fasta");
411 Function: Get/Set sequence format retrieval. The get-form will normally
412 not be used outside of this and derived modules.
413 Returns : Array of two strings, the first representing the format for
414 retrieval, and the second specifying the corresponding SeqIO
415 format.
416 Args : $format = sequence format
418 =cut
# Get/set the sequence format used for retrieval.
# Args   : optional format name. Values containing "sprot" or "swiss"
#          select the SwissProt format; values starting with "fa" select
#          fasta. Matching is case-insensitive, in line with the check
#          done in new().
# Returns: a two-element list: the format name sent to the server and the
#          corresponding SeqIO format name.
# An unrecognized value triggers a warning and falls back to the server's
# fallback format (fasta for expasy, swissprot for ebi). For a server type
# matching neither "expasy" nor "ebi" the stored format is left unchanged.
sub request_format {
    my ($self, $value) = @_;
    if ( defined $value ) {
        my $servertype = $self->servertype;

        # Each server has its own name for the SwissProt format and its
        # own fallback for unrecognized requests.
        my ($swiss, $fallback);
        if ( $servertype =~ /expasy/ ) {
            $swiss    = [ 'sprot', 'swiss' ];
            $fallback = [ 'fasta', 'fasta' ];
        }
        elsif ( $servertype =~ /ebi/ ) {
            $swiss    = [ 'swissprot', 'swiss' ];
            $fallback = [ 'swissprot', 'swiss' ];
        }

        if ( defined $swiss ) {
            if ( $value =~ /sprot/i || $value =~ /swiss/i ) {
                $self->{'_format'} = $swiss;
            }
            elsif ( $value =~ /^fa/i ) {
                $self->{'_format'} = [ 'fasta', 'fasta' ];
            }
            else {
                $self->warn("Unrecognized format $value requested");
                $self->{'_format'} = $fallback;
            }
        }
    }
    return @{ $self->{'_format'} };
}
448 __END__