sync with trunk to r15684
[bioperl-live.git] / Bio / DB / SwissProt.pm
blob5a64d80e31cfa043711c8c240aead71a162eadf0
2 # $Id$
4 # BioPerl module for Bio::DB::SwissProt
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Jason Stajich <jason@bioperl.org>
10 # Copyright Jason Stajich
12 # You may distribute this module under the same terms as perl itself
14 # POD documentation - main docs before the code
15 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
17 =head1 NAME
19 Bio::DB::SwissProt - Database object interface to SwissProt retrieval
21 =head1 SYNOPSIS
23 use Bio::DB::SwissProt;
25 $sp = Bio::DB::SwissProt->new();
27 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
28 # <4-letter-identifier>_<species 5-letter code>
29 # or ...
30 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
31 # [OPQ]xxxxx
34 # In fact in this implementation
35 # these methods call the same webscript so you can use
36 # then interchangeably
38 # choose a different server to query
39 $sp = Bio::DB::SwissProt->new('-servertype' => 'expasy',
40 '-hostlocation' => 'us');
42 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
44 =head1 DESCRIPTION
46 SwissProt is a curated database of proteins managed by the Swiss
47 Bioinformatics Institute. Additional tools for
48 parsing and manipulating swissprot files can be found at
49 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
51 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
52 SwissProt database via an Expasy retrieval.
54 In order to make changes transparent we have host type (currently only
55 expasy) and location (default to Switzerland) separated out. This
56 allows the user to pick the closest Expasy mirror for running their
57 queries.
60 =head1 FEEDBACK
62 =head2 Mailing Lists
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to one
66 of the Bioperl mailing lists. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 =head2 Support
74 Please direct usage questions or support issues to the mailing list:
76 L<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
83 =head2 Reporting Bugs
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via the
87 web:
89 http://bugzilla.open-bio.org/
91 =head1 AUTHOR - Jason Stajich
93 Email Jason Stajich E<lt>jason@bioperl.org E<lt>
95 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss
96 Institute of Bioinformatics for helping point us in the direction of
97 the correct expasy scripts and for swissknife references.
99 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt>
100 for help with adding EBI swall server.
102 =head1 APPENDIX
104 The rest of the documentation details each of the object
105 methods. Internal methods are usually preceded with a _
107 =cut
109 # Let the code begin...
111 package Bio::DB::SwissProt;
112 use strict;
114 use HTTP::Request::Common;
115 our $MODVERSION = '0.8.1';
117 use base qw(Bio::DB::WebDBSeqI);
119 # global vars
120 our $DEFAULTSERVERTYPE = 'ebi';
121 our $DEFAULTFORMAT = 'swissprot';
122 our $DEFAULTIDTRACKER = 'http://www.expasy.ch';
124 # you can add your own here theoretically.
125 our %HOSTS = (
126 'expasy' => {
127 'default' => 'us',
128 'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
129 'hosts' =>
131 'switzerland' => 'ch.expasy.org',
132 'canada' => 'ca.expasy.org',
133 'china' => 'cn.expasy.org',
134 'taiwan' => 'tw.expasy.org',
135 'australia' => 'au.expasy.org',
136 'korea' => 'kr.expasy.org',
137 'us' => 'us.expasy.org',
139 # ick, CGI variables
140 'jointype' => ' ',
141 'idvar' => 'list',
142 'basevars' => [ ],
144 'ebi' => {
145 'default' => 'uk',
146 'baseurl' => 'http://%s/cgi-bin/dbfetch',
147 'hosts' => {
148 'uk' => 'www.ebi.ac.uk',
150 'jointype' => ',',
151 'idvar' => 'id',
152 'basevars' => [ 'db' => 'UniProtKB',
153 'style' => 'raw' ],
157 # new modules should be a little more lightweight and
158 # should use Bio::Root::Root
159 sub new {
160 my ($class, @args) = @_;
161 my $self = $class->SUPER::new(@args);
163 my ($format, $hostlocation,$servertype) =
164 $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)],
165 @args);
167 if( $format && $format !~ /(swiss)|(fasta)/i ) {
168 $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
169 $format = $self->default_format;
171 $servertype = $DEFAULTSERVERTYPE unless $servertype;
172 $servertype = lc $servertype;
173 $self->servertype($servertype);
174 if ( $hostlocation ) {
175 $self->hostlocation(lc $hostlocation);
178 $self->request_format($format); # let's always override the format, as it must be swiss or fasta
179 return $self;
182 =head2 Routines from Bio::DB::RandomAccessI
184 =cut
186 =head2 get_Seq_by_id
188 Title : get_Seq_by_id
189 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
190 Function: Gets a Bio::Seq object by its name
191 Returns : a Bio::Seq object
192 Args : the id (as a string) of a sequence
193 Throws : "id does not exist" exception
195 =cut
197 =head2 get_Seq_by_acc
199 Title : get_Seq_by_acc
200 Usage : $seq = $db->get_Seq_by_acc('X77802');
201 Function: Gets a Bio::Seq object by accession number
202 Returns : A Bio::Seq object
203 Args : accession number (as a string)
204 Throws : "acc does not exist" exception
206 =cut
208 =head2 get_Stream_by_id
210 Title : get_Stream_by_id
211 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
212 Function: Gets a series of Seq objects by unique identifiers
213 Returns : a Bio::SeqIO stream object
214 Args : $ref : a reference to an array of unique identifiers for
215 the desired sequence entries
217 =cut
219 =head2 get_Stream_by_acc
221 Title : get_Stream_by_acc
222 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
223 Function: Gets a series of Seq objects by accession numbers
224 Returns : a Bio::SeqIO stream object
225 Args : $ref : a reference to an array of accession numbers for
226 the desired sequence entries
227 Note : For GenBank, this just calls the same code for get_Stream_by_id()
229 =cut
231 =head2 get_Stream_by_batch
233 Title : get_Stream_by_batch
234 Usage : $seq = $db->get_Stream_by_batch($ref);
235 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
236 at a time. This is implemented the same way as get_Stream_by_id,
237 but is provided here in keeping with access methods of NCBI
238 modules.
239 Example :
240 Returns : a Bio::SeqIO stream object
241 Args : $ref : either an array reference, a filename, or a filehandle
242 from which to get the list of unique ids/accession numbers.
244 NOTE: deprecated API. Use get_Stream_by_id() instead.
246 =cut
248 *get_Stream_by_batch = sub {
249 my $self = shift;
250 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
251 $self->get_Stream_by_id(@_)
254 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
256 =cut
258 =head2 get_request
260 Title : get_request
261 Usage : my $url = $self->get_request
262 Function: returns a HTTP::Request object
263 Returns :
264 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
266 =cut
268 sub get_request {
269 my ($self, @qualifiers) = @_;
270 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
271 @qualifiers);
273 if( !defined $uids ) {
274 $self->throw("Must specify a value for uids to query");
276 my ($f,undef) = $self->request_format($format);
278 my %vars = (
279 @{$HOSTS{$self->servertype}->{'basevars'}},
280 ( 'format' => $f )
283 my $url = $self->location_url;
285 my $uid;
286 my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' ';
287 my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id';
289 if( ref($uids) =~ /ARRAY/i ) {
290 # HTTP::Request automagically converts the ' ' to %20
291 $uid = join($jointype, @$uids);
292 } else {
293 $uid = $uids;
295 $vars{$idvar} = $uid;
297 return POST $url, \%vars;
300 =head2 postprocess_data
302 Title : postprocess_data
303 Usage : $self->postprocess_data ( 'type' => 'string',
304 'location' => \$datastr);
305 Function: process downloaded data before loading into a Bio::SeqIO
306 Returns : void
307 Args : hash with two keys - 'type' can be 'string' or 'file'
308 - 'location' either file location or string
309 reference containing data
311 =cut
313 # don't need to do anything
315 sub postprocess_data {
316 my ($self, %args) = @_;
317 return;
320 =head2 default_format
322 Title : default_format
323 Usage : my $format = $self->default_format
324 Function: Returns default sequence format for this module
325 Returns : string
326 Args : none
328 =cut
330 sub default_format {
331 return $DEFAULTFORMAT;
334 =head2 Bio::DB::SwissProt specific routines
336 =cut
338 =head2 servertype
340 Title : servertype
341 Usage : my $servertype = $self->servertype
342 $self->servertype($servertype);
343 Function: Get/Set server type
344 Returns : string
345 Args : server type string [optional]
347 =cut
349 sub servertype {
350 my ($self, $servertype) = @_;
351 if( defined $servertype && $servertype ne '') {
352 $self->throw("You gave an invalid server type ($servertype)".
353 " - available types are ".
354 keys %HOSTS) unless( $HOSTS{$servertype} );
355 $self->{'_servertype'} = $servertype;
356 $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};
358 # make sure format is reset properly in that different
359 # servers have different syntaxes
360 my ($existingformat,$seqioformat) = $self->request_format;
361 $self->request_format($existingformat);
363 return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
367 =head2 hostlocation
369 Title : hostlocation
370 Usage : my $location = $self->hostlocation()
371 $self->hostlocation($location)
372 Function: Set/Get Hostlocation
373 Returns : string representing hostlocation
374 Args : string specifying hostlocation [optional]
376 =cut
378 sub hostlocation {
379 my ($self, $location ) = @_;
380 $location = lc $location;
381 my $servertype = $self->servertype;
382 $self->throw("Must have a valid servertype defined not $servertype")
383 unless defined $servertype;
384 my %hosts = %{$HOSTS{$servertype}->{'hosts'}};
385 if( defined $location && $location ne '' ) {
386 if( ! $hosts{$location} ) {
387 $self->throw("Must specify a known host, not $location,".
388 " possible values (".
389 join(",", sort keys %hosts ). ")");
391 $self->{'_hostlocation'} = $location;
393 return $self->{'_hostlocation'};
396 =head2 location_url
398 Title : location
399 Usage : my $url = $self->location_url()
400 Function: Get host url
401 Returns : string representing url
402 Args : none
404 =cut
406 sub location_url {
407 my ($self) = @_;
408 my $servertype = $self->servertype();
409 my $location = $self->hostlocation();
411 if( ! defined $location || !defined $servertype ) {
412 $self->throw("must have a valid hostlocation and servertype set before calling location_url");
414 return sprintf($HOSTS{$servertype}->{'baseurl'},
415 $HOSTS{$servertype}->{'hosts'}->{$location});
418 =head2 request_format
420 Title : request_format
421 Usage : my ($req_format, $ioformat) = $self->request_format;
422 $self->request_format("genbank");
423 $self->request_format("fasta");
424 Function: Get/Set sequence format retrieval. The get-form will normally
425 not be used outside of this and derived modules.
426 Returns : Array of two strings, the first representing the format for
427 retrieval, and the second specifying the corresponding SeqIO
428 format.
429 Args : $format = sequence format
431 =cut
433 sub request_format {
434 my ($self, $value) = @_;
435 if( defined $value ) {
436 if( $self->servertype =~ /expasy/ ) {
437 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
438 $self->{'_format'} = [ 'sprot', 'swiss'];
439 } elsif( $value =~ /^fa/ ) {
440 $self->{'_format'} = [ 'fasta', 'fasta'];
441 } else {
442 $self->warn("Unrecognized format $value requested");
443 $self->{'_format'} = [ 'fasta', 'fasta'];
445 } elsif( $self->servertype =~ /ebi/ ) {
446 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
447 $self->{'_format'} = [ 'swissprot', 'swiss' ];
448 } elsif( $value =~ /^fa/ ) {
449 $self->{'_format'} = [ 'fasta', 'fasta'];
450 } else {
451 $self->warn("Unrecognized format $value requested");
452 $self->{'_format'} = [ 'swissprot', 'swiss'];
456 return @{$self->{'_format'}};
459 =head2 idtracker
461 Title : idtracker
462 Usage : my ($newid) = $self->idtracker($oldid);
463 Function: Retrieve new ID using old ID.
464 Returns : single ID if one is found
465 Args : ID to look for
467 =cut
469 sub idtracker {
470 my ($self, $id) = @_;
471 return unless defined $id;
472 my $st = $self->servertype;
473 my $base = ($st eq 'expasy') ? "http://".$HOSTS{$st}->{'hosts'}->{$self->hostlocation}
474 : $DEFAULTIDTRACKER;
475 my $url = $base.'/cgi-bin/idtracker?id='.$id;
476 my $response;
477 eval {$response = $self->ua->get($url)};
478 if ($@ || $response->is_error) {
479 my $error = $@ || $response->error_as_HTML;
480 $self->throw("Error:\n".$error);
482 if ($response->content =~ /was renamed to <b>(.*?)<\/b>/) {
483 return $1;
484 } elsif ($response->content =~ /<tr><th>Entry name<\/th><th>Accession number<\/th><th>Release created<\/th><\/tr>/){
485 # output indicates no mapping needed, return original ID
486 return $id;
487 } else {
488 $self->warn("Unknown response:\n".$response->content);
489 return
495 __END__