4 # BioPerl module for Bio::DB::SwissProt
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Jason Stajich <jason@bioperl.org>
10 # Copyright Jason Stajich
12 # You may distribute this module under the same terms as perl itself
14 # POD documentation - main docs before the code
15 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
19 Bio::DB::SwissProt - Database object interface to SwissProt retrieval
23 use Bio::DB::SwissProt;
25 $sp = Bio::DB::SwissProt->new();
27 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
28 # <4-letter-identifier>_<species 5-letter code>
30 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
34 # In fact in this implementation
35 # these methods call the same webscript so you can use
36 # then interchangeably
38 # choose a different server to query
39 $sp = Bio::DB::SwissProt->new('-servertype' => 'expasy',
40 '-hostlocation' => 'us');
42 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
46 SwissProt is a curated database of proteins managed by the Swiss
47 Bioinformatics Institute. Additional tools for
48 parsing and manipulating swissprot files can be found at
49 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
51 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
52 SwissProt database via an Expasy retrieval.
54 In order to make changes transparent we have host type (currently only
55 expasy) and location (default to Switzerland) separated out. This
56 allows the user to pick the closest Expasy mirror for running their
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to one
66 of the Bioperl mailing lists. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 Please direct usage questions or support issues to the mailing list:
76 L<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via the
89 http://bugzilla.open-bio.org/
91 =head1 AUTHOR - Jason Stajich
93 Email Jason Stajich E<lt>jason@bioperl.org E<lt>
95 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss
96 Institute of Bioinformatics for helping point us in the direction of
97 the correct expasy scripts and for swissknife references.
99 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt>
100 for help with adding EBI swall server.
104 The rest of the documentation details each of the object
105 methods. Internal methods are usually preceded with a _
109 # Let the code begin...
111 package Bio
::DB
::SwissProt
;
114 use HTTP
::Request
::Common
;
115 our $MODVERSION = '0.8.1';
117 use base
qw(Bio::DB::WebDBSeqI);
120 our $DEFAULTSERVERTYPE = 'ebi';
121 our $DEFAULTFORMAT = 'swissprot';
122 our $DEFAULTIDTRACKER = 'http://www.expasy.ch';
124 # you can add your own here theoretically.
128 'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
131 'switzerland' => 'ch.expasy.org',
132 'canada' => 'ca.expasy.org',
133 'china' => 'cn.expasy.org',
134 'taiwan' => 'tw.expasy.org',
135 'australia' => 'au.expasy.org',
136 'korea' => 'kr.expasy.org',
137 'us' => 'us.expasy.org',
146 'baseurl' => 'http://%s/cgi-bin/dbfetch',
148 'uk' => 'www.ebi.ac.uk',
152 'basevars' => [ 'db' => 'UniProtKB',
157 # new modules should be a little more lightweight and
158 # should use Bio::Root::Root
160 my ($class, @args) = @_;
161 my $self = $class->SUPER::new
(@args);
163 my ($format, $hostlocation,$servertype) =
164 $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)],
167 if( $format && $format !~ /(swiss)|(fasta)/i ) {
168 $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
169 $format = $self->default_format;
171 $servertype = $DEFAULTSERVERTYPE unless $servertype;
172 $servertype = lc $servertype;
173 $self->servertype($servertype);
174 if ( $hostlocation ) {
175 $self->hostlocation(lc $hostlocation);
178 $self->request_format($format); # let's always override the format, as it must be swiss or fasta
182 =head2 Routines from Bio::DB::RandomAccessI
188 Title : get_Seq_by_id
189 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
190 Function: Gets a Bio::Seq object by its name
191 Returns : a Bio::Seq object
192 Args : the id (as a string) of a sequence
193 Throws : "id does not exist" exception
197 =head2 get_Seq_by_acc
199 Title : get_Seq_by_acc
200 Usage : $seq = $db->get_Seq_by_acc('X77802');
201 Function: Gets a Bio::Seq object by accession number
202 Returns : A Bio::Seq object
203 Args : accession number (as a string)
204 Throws : "acc does not exist" exception
208 =head2 get_Stream_by_id
210 Title : get_Stream_by_id
211 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
212 Function: Gets a series of Seq objects by unique identifiers
213 Returns : a Bio::SeqIO stream object
214 Args : $ref : a reference to an array of unique identifiers for
215 the desired sequence entries
219 =head2 get_Stream_by_acc
221 Title : get_Stream_by_acc
222 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
223 Function: Gets a series of Seq objects by accession numbers
224 Returns : a Bio::SeqIO stream object
225 Args : $ref : a reference to an array of accession numbers for
226 the desired sequence entries
227 Note : For GenBank, this just calls the same code for get_Stream_by_id()
231 =head2 get_Stream_by_batch
233 Title : get_Stream_by_batch
234 Usage : $seq = $db->get_Stream_by_batch($ref);
235 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
236 at a time. This is implemented the same way as get_Stream_by_id,
237 but is provided here in keeping with access methods of NCBI
240 Returns : a Bio::SeqIO stream object
241 Args : $ref : either an array reference, a filename, or a filehandle
242 from which to get the list of unique ids/accession numbers.
244 NOTE: deprecated API. Use get_Stream_by_id() instead.
248 *get_Stream_by_batch
= sub {
250 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
251 $self->get_Stream_by_id(@_)
254 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
261 Usage : my $url = $self->get_request
262 Function: returns a HTTP::Request object
264 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
269 my ($self, @qualifiers) = @_;
270 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
273 if( !defined $uids ) {
274 $self->throw("Must specify a value for uids to query");
276 my ($f,undef) = $self->request_format($format);
279 @
{$HOSTS{$self->servertype}->{'basevars'}},
283 my $url = $self->location_url;
286 my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' ';
287 my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id';
289 if( ref($uids) =~ /ARRAY/i ) {
290 # HTTP::Request automagically converts the ' ' to %20
291 $uid = join($jointype, @
$uids);
295 $vars{$idvar} = $uid;
297 return POST
$url, \
%vars;
300 =head2 postprocess_data
302 Title : postprocess_data
303 Usage : $self->postprocess_data ( 'type' => 'string',
304 'location' => \$datastr);
305 Function: process downloaded data before loading into a Bio::SeqIO
307 Args : hash with two keys - 'type' can be 'string' or 'file'
308 - 'location' either file location or string
309 reference containing data
313 # don't need to do anything
315 sub postprocess_data
{
316 my ($self, %args) = @_;
320 =head2 default_format
322 Title : default_format
323 Usage : my $format = $self->default_format
324 Function: Returns default sequence format for this module
331 return $DEFAULTFORMAT;
334 =head2 Bio::DB::SwissProt specific routines
341 Usage : my $servertype = $self->servertype
342 $self->servertype($servertype);
343 Function: Get/Set server type
345 Args : server type string [optional]
350 my ($self, $servertype) = @_;
351 if( defined $servertype && $servertype ne '') {
352 $self->throw("You gave an invalid server type ($servertype)".
353 " - available types are ".
354 keys %HOSTS) unless( $HOSTS{$servertype} );
355 $self->{'_servertype'} = $servertype;
356 $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};
358 # make sure format is reset properly in that different
359 # servers have different syntaxes
360 my ($existingformat,$seqioformat) = $self->request_format;
361 $self->request_format($existingformat);
363 return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
370 Usage : my $location = $self->hostlocation()
371 $self->hostlocation($location)
372 Function: Set/Get Hostlocation
373 Returns : string representing hostlocation
374 Args : string specifying hostlocation [optional]
379 my ($self, $location ) = @_;
380 $location = lc $location;
381 my $servertype = $self->servertype;
382 $self->throw("Must have a valid servertype defined not $servertype")
383 unless defined $servertype;
384 my %hosts = %{$HOSTS{$servertype}->{'hosts'}};
385 if( defined $location && $location ne '' ) {
386 if( ! $hosts{$location} ) {
387 $self->throw("Must specify a known host, not $location,".
388 " possible values (".
389 join(",", sort keys %hosts ). ")");
391 $self->{'_hostlocation'} = $location;
393 return $self->{'_hostlocation'};
399 Usage : my $url = $self->location_url()
400 Function: Get host url
401 Returns : string representing url
408 my $servertype = $self->servertype();
409 my $location = $self->hostlocation();
411 if( ! defined $location || !defined $servertype ) {
412 $self->throw("must have a valid hostlocation and servertype set before calling location_url");
414 return sprintf($HOSTS{$servertype}->{'baseurl'},
415 $HOSTS{$servertype}->{'hosts'}->{$location});
418 =head2 request_format
420 Title : request_format
421 Usage : my ($req_format, $ioformat) = $self->request_format;
422 $self->request_format("genbank");
423 $self->request_format("fasta");
424 Function: Get/Set sequence format retrieval. The get-form will normally
425 not be used outside of this and derived modules.
426 Returns : Array of two strings, the first representing the format for
427 retrieval, and the second specifying the corresponding SeqIO
429 Args : $format = sequence format
434 my ($self, $value) = @_;
435 if( defined $value ) {
436 if( $self->servertype =~ /expasy/ ) {
437 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
438 $self->{'_format'} = [ 'sprot', 'swiss'];
439 } elsif( $value =~ /^fa/ ) {
440 $self->{'_format'} = [ 'fasta', 'fasta'];
442 $self->warn("Unrecognized format $value requested");
443 $self->{'_format'} = [ 'fasta', 'fasta'];
445 } elsif( $self->servertype =~ /ebi/ ) {
446 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
447 $self->{'_format'} = [ 'swissprot', 'swiss' ];
448 } elsif( $value =~ /^fa/ ) {
449 $self->{'_format'} = [ 'fasta', 'fasta'];
451 $self->warn("Unrecognized format $value requested");
452 $self->{'_format'} = [ 'swissprot', 'swiss'];
456 return @
{$self->{'_format'}};
462 Usage : my ($newid) = $self->idtracker($oldid);
463 Function: Retrieve new ID using old ID.
464 Returns : single ID if one is found
465 Args : ID to look for
470 my ($self, $id) = @_;
471 return unless defined $id;
472 my $st = $self->servertype;
473 my $base = ($st eq 'expasy') ?
"http://".$HOSTS{$st}->{'hosts'}->{$self->hostlocation}
475 my $url = $base.'/cgi-bin/idtracker?id='.$id;
477 eval {$response = $self->ua->get($url)};
478 if ($@
|| $response->is_error) {
479 my $error = $@
|| $response->error_as_HTML;
480 $self->throw("Error:\n".$error);
482 if ($response->content =~ /was renamed to <b>(.*?)<\/b
>/) {
484 } elsif ($response->content =~ /<tr><th>Entry name<\/th
><th
>Accession number
<\
/th><th>Release created<\/th
><\
/tr>/){
485 # output indicates no mapping needed, return original ID
488 $self->warn("Unknown response:\n".$response->content);