4 # BioPerl module for Bio::DB::SwissProt
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
13 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
17 Bio::DB::SwissProt - Database object interface to SwissProt retrieval
21 use Bio::DB::SwissProt;
23 $sp = Bio::DB::SwissProt->new();
25 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
26 # <4-letter-identifier>_<species 5-letter code>
28 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
32 # In fact in this implementation
33 # these methods call the same webscript so you can use
34 # them interchangeably
36 # choose a different server to query
37 $sp = Bio::DB::SwissProt->new('-servertype' => 'expasy',
38 '-hostlocation' => 'us');
40 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
44 SwissProt is a curated database of proteins managed by the Swiss
45 Institute of Bioinformatics. Additional tools for
46 parsing and manipulating swissprot files can be found at
47 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
49 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
50 SwissProt database via an Expasy retrieval.
52 In order to make changes transparent we have host type (expasy or
53 ebi) and location (default to Switzerland) separated out. This
54 allows the user to pick the closest Expasy mirror for running their
62 User feedback is an integral part of the evolution of this and other
63 Bioperl modules. Send your comments and suggestions preferably to one
64 of the Bioperl mailing lists. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 of the bugs and their resolution. Bug reports can be submitted via the
76 http://bugzilla.open-bio.org/
78 =head1 AUTHOR - Jason Stajich
80 Email Jason Stajich E<lt>jason@bioperl.orgE<gt>
82 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss
83 Institute of Bioinformatics for helping point us in the direction of
84 the correct expasy scripts and for swissknife references.
86 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt>
87 for help with adding EBI swall server.
91 The rest of the documentation details each of the object
92 methods. Internal methods are usually preceded with a _
96 # Let the code begin...
98 package Bio
::DB
::SwissProt
;
100 use vars
qw($MODVERSION %HOSTS $DEFAULTFORMAT $DEFAULTSERVERTYPE);
102 $MODVERSION = '0.8.1';
103 use HTTP::Request::Common;
105 use base qw(Bio::DB::WebDBSeqI);
108 $DEFAULTSERVERTYPE = 'ebi';
109 $DEFAULTFORMAT = 'swissprot';
111 # you can add your own here theoretically.
# Server table consulted by servertype(), hostlocation(), location_url() and
# get_request(). 'baseurl' is a sprintf template: location_url() fills the %s
# with the host name looked up for the current host location.
# NOTE(review): the keys that enclose these entries (the server type names and
# the 'hosts' / 'default' keys read elsewhere in this file) are not visible in
# this excerpt -- confirm the nesting against the full file.
115 'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
# location name => host name pairs (presumably the 'hosts' sub-hash that
# hostlocation() validates against -- TODO confirm)
118 'switzerland' => 'ch.expasy.org',
119 'canada' => 'ca.expasy.org',
120 'china' => 'cn.expasy.org',
121 'taiwan' => 'tw.expasy.org',
122 'australia' => 'au.expasy.org',
123 'korea' => 'kr.expasy.org',
124 'us' => 'us.expasy.org',
# second entry: a dbfetch endpoint with a single 'uk' location
133 'baseurl' => 'http://%s/cgi-bin/dbfetch',
135 'uk' => 'www.ebi.ac.uk',
# 'basevars' is dereferenced as an array in get_request()
139 'basevars' => [ 'db' => 'UniProtKB',
144 # new modules should be a little more lightweight and
145 # should use Bio::Root::Root
# Constructor body: delegates object creation to the parent class, then
# applies the optional -format, -hostlocation and -servertype arguments.
147 my ($class, @args) = @_;
148 my $self = $class->SUPER::new
(@args);
# pull the named arguments this class understands out of @args
150 my ($format, $hostlocation,$servertype) =
151 $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)],
# any format whose name does not contain "swiss" or "fasta" (case-insensitive)
# is replaced by the module default, with a warning
154 if( $format && $format !~ /(swiss)|(fasta)/i ) {
155 $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
156 $format = $self->default_format;
# servertype must be set before hostlocation: servertype() overwrites
# $self->{'_hostlocation'} with the server's 'default' entry, and
# hostlocation() validates its argument against the current server's hosts
158 $servertype = $DEFAULTSERVERTYPE unless $servertype;
159 $servertype = lc $servertype;
160 $self->servertype($servertype);
161 if ( $hostlocation ) {
162 $self->hostlocation(lc $hostlocation);
# request_format() is a no-op setter when $format is undefined
165 $self->request_format($format); # let's always override the format, as it must be swiss or fasta
169 =head2 Routines from Bio::DB::RandomAccessI
175 Title : get_Seq_by_id
176 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
177 Function: Gets a Bio::Seq object by its name
178 Returns : a Bio::Seq object
179 Args : the id (as a string) of a sequence
180 Throws : "id does not exist" exception
184 =head2 get_Seq_by_acc
186 Title : get_Seq_by_acc
187 Usage : $seq = $db->get_Seq_by_acc('X77802');
188 Function: Gets a Bio::Seq object by accession number
189 Returns : A Bio::Seq object
190 Args : accession number (as a string)
191 Throws : "acc does not exist" exception
195 =head2 get_Stream_by_id
197 Title : get_Stream_by_id
198 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
199 Function: Gets a series of Seq objects by unique identifiers
200 Returns : a Bio::SeqIO stream object
201 Args : $ref : a reference to an array of unique identifiers for
202 the desired sequence entries
206 =head2 get_Stream_by_acc
208 Title : get_Stream_by_acc
209 Usage : $stream = $db->get_Stream_by_acc([$acc1, $acc2]);
210 Function: Gets a series of Seq objects by accession numbers
211 Returns : a Bio::SeqIO stream object
212 Args : $ref : a reference to an array of accession numbers for
213 the desired sequence entries
214 Note : For SwissProt, this just calls the same code for get_Stream_by_id()
218 =head2 get_Stream_by_batch
220 Title : get_Stream_by_batch
221 Usage : $stream = $db->get_Stream_by_batch($ref);
222 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
223 at a time. This is implemented the same way as get_Stream_by_id,
224 but is provided here in keeping with access methods of NCBI
227 Returns : a Bio::SeqIO stream object
228 Args : $ref : either an array reference, a filename, or a filehandle
229 from which to get the list of unique ids/accession numbers.
231 NOTE: deprecated API. Use get_Stream_by_id() instead.
# Deprecated entry point, installed by assigning an anonymous sub to the
# symbol-table glob. It reports the deprecation through $self->deprecated()
# and then forwards its remaining arguments to get_Stream_by_id(), whose
# result is the value of the sub's last statement.
# NOTE(review): the line that takes $self off @_ is not visible in this
# excerpt -- confirm against the full file.
235 *get_Stream_by_batch
= sub {
237 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
238 $self->get_Stream_by_id(@_)
241 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
248 Usage : my $url = $self->get_request
249 Function: returns a HTTP::Request object
251 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
# Builds the HTTP POST request for a set of identifiers.
#   Args   : named qualifiers; -uids (required, a single id or an array ref
#            of ids) and -format (optional)
#   Returns: the value of HTTP::Request::Common's POST for the server URL and
#            the assembled form variables
#   Throws : if -uids is not defined
256 my ($self, @qualifiers) = @_;
257 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
260 if( !defined $uids ) {
261 $self->throw("Must specify a value for uids to query");
# request_format() stores $format when it is defined and returns the
# (retrieval format, SeqIO format) pair; only the first element is kept
263 my ($f,undef) = $self->request_format($format);
266 @
{$HOSTS{$self->servertype}->{'basevars'}},
270 my $url = $self->location_url;
# per-server overrides for the id separator and the form field name, with
# ' ' and 'id' as fallbacks when the server entry defines neither
273 my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' ';
274 my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id';
# an array ref of ids is collapsed into one separator-joined string
276 if( ref($uids) =~ /ARRAY/i ) {
277 # HTTP::Request automagically converts the ' ' to %20
278 $uid = join($jointype, @
$uids);
282 $vars{$idvar} = $uid;
# POST is imported from HTTP::Request::Common at the top of the file
284 return POST
$url, \
%vars;
287 =head2 postprocess_data
289 Title : postprocess_data
290 Usage : $self->postprocess_data ( 'type' => 'string',
291 'location' => \$datastr);
292 Function: process downloaded data before loading into a Bio::SeqIO
294 Args : hash with two keys - 'type' can be 'string' or 'file'
295 - 'location' either file location or string
296 reference containing data
300 # don't need to do anything
# Hook called with the downloaded data before it is handed to Bio::SeqIO.
# The visible body only unpacks its arguments ('type' and 'location' per the
# POD above) and performs no processing.
# NOTE(review): the end of the sub is not visible in this excerpt.
302 sub postprocess_data
{
303 my ($self, %args) = @_;
307 =head2 default_format
309 Title : default_format
310 Usage : my $format = $self->default_format
311 Function: Returns default sequence format for this module
# $DEFAULTFORMAT is assigned 'swissprot' at package level
318 return $DEFAULTFORMAT;
321 =head2 Bio::DB::SwissProt specific routines
328 Usage : my $servertype = $self->servertype
329 $self->servertype($servertype);
330 Function: Get/Set server type
332 Args : server type string [optional]
# Get/set the server type.
#   Args   : optional server type string; must be a key of %HOSTS
#   Returns: the stored server type, or $DEFAULTSERVERTYPE when none is stored
#   Throws : if a non-empty server type is given that is not a key of %HOSTS
# Setting the server type also resets the host location to that server's
# 'default' entry and re-applies the current request format.
337 my ($self, $servertype) = @_;
338 if( defined $servertype && $servertype ne '') {
# the names are joined explicitly: a bare "keys %HOSTS" in a string
# concatenation is evaluated in scalar context and yields the number of
# keys, so the message used to show a count instead of the valid names
339 $self->throw("You gave an invalid server type ($servertype)".
340 " - available types are ".
341 join(",", sort keys %HOSTS)) unless( $HOSTS{$servertype} );
342 $self->{'_servertype'} = $servertype;
343 $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};
345 # make sure format is reset properly in that different
346 # servers have different syntaxes
347 my ($existingformat,$seqioformat) = $self->request_format;
348 $self->request_format($existingformat);
350 return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
357 Usage : my $location = $self->hostlocation()
358 $self->hostlocation($location)
359 Function: Set/Get Hostlocation
360 Returns : string representing hostlocation
361 Args : string specifying hostlocation [optional]
# Get/set the host location for the current server type.
#   Args   : optional location string (matched case-insensitively); must be a
#            key of the current server's 'hosts' table
#   Returns: the stored host location
#   Throws : if a non-empty location is given that is not a known host
366 my ($self, $location ) = @_;
# only lower-case a real argument: calling lc on undef (every getter call)
# emits an "uninitialized value" warning when warnings are enabled
367 $location = lc $location if defined $location;
368 my $servertype = $self->servertype;
369 $self->throw("Must have a valid servertype defined not $servertype")
370 unless defined $servertype;
371 my %hosts = %{$HOSTS{$servertype}->{'hosts'}};
372 if( defined $location && $location ne '' ) {
373 if( ! $hosts{$location} ) {
374 $self->throw("Must specify a known host, not $location,".
375 " possible values (".
376 join(",", sort keys %hosts ). ")");
378 $self->{'_hostlocation'} = $location;
380 return $self->{'_hostlocation'};
386 Usage : my $url = $self->location_url()
387 Function: Get host url
388 Returns : string representing url
# Builds the request URL for the current server type and host location.
#   Returns: the server's 'baseurl' template with its %s replaced by the host
#            name registered for the current location
#   Throws : if either the host location or the server type is undefined
395 my $servertype = $self->servertype();
396 my $location = $self->hostlocation();
398 if( ! defined $location || !defined $servertype ) {
399 $self->throw("must have a valid hostlocation and servertype set before calling location_url");
# 'baseurl' holds a single %s placeholder for the host name
401 return sprintf($HOSTS{$servertype}->{'baseurl'},
402 $HOSTS{$servertype}->{'hosts'}->{$location});
405 =head2 request_format
407 Title : request_format
408 Usage : my ($req_format, $ioformat) = $self->request_format;
409 $self->request_format("swiss");
410 $self->request_format("fasta");
411 Function: Get/Set sequence format retrieval. The get-form will normally
412 not be used outside of this and derived modules.
413 Returns : Array of two strings, the first representing the format for
414 retrieval, and the second specifying the corresponding SeqIO
416 Args : $format = sequence format
# Get/set the retrieval format.
#   Args   : optional format name
#   Returns: the two elements stored in $self->{'_format'} as a list:
#            (format name sent to the server, matching SeqIO format name)
# With a defined $value the pair is chosen according to the current server
# type; without one the stored pair is returned unchanged.
421 my ($self, $value) = @_;
422 if( defined $value ) {
# expasy: names containing "sprot" or "swiss" map to 'sprot', names starting
# with "fa" map to 'fasta'; anything else warns and falls back to fasta
423 if( $self->servertype =~ /expasy/ ) {
424 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
425 $self->{'_format'} = [ 'sprot', 'swiss'];
426 } elsif( $value =~ /^fa/ ) {
427 $self->{'_format'} = [ 'fasta', 'fasta'];
429 $self->warn("Unrecognized format $value requested");
430 $self->{'_format'} = [ 'fasta', 'fasta'];
# ebi: same matching rules, but the server-side name is 'swissprot' and an
# unrecognized format warns and falls back to swissprot rather than fasta
432 } elsif( $self->servertype =~ /ebi/ ) {
433 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
434 $self->{'_format'} = [ 'swissprot', 'swiss' ];
435 } elsif( $value =~ /^fa/ ) {
436 $self->{'_format'} = [ 'fasta', 'fasta'];
438 $self->warn("Unrecognized format $value requested");
439 $self->{'_format'} = [ 'swissprot', 'swiss'];
# flatten the stored array ref into the returned list
443 return @
{$self->{'_format'}};