Updated http URLs to https
[bioperl-live.git] / scripts / DB / bp_biofetch_genbank_proxy.pl
blobd21de8313d51fff63d95829c36cf375505cb53d1
1 #!perl
3 # dbfetch style caching proxy for GenBank
4 use strict;
5 use warnings;
6 use CGI qw(:standard);
7 use HTTP::Request::Common;
8 use LWP::UserAgent;
9 use Cache::FileCache;
11 use vars qw(%GOT $BUFFER %MAPPING $CACHE);
13 use constant CACHE_LOCATION => '/usr/tmp/dbfetch_cache';
14 use constant MAX_SIZE => 100_000_000; # 100 megs, roughly
15 use constant CACHE_DEPTH => 4;
16 use constant EXPIRATION => "1 week";
17 use constant PURGE => "1 hour";
19 %MAPPING = (genbank => {db=>'nucleotide',
20 rettype => 'gb'},
21 genpep => {db=>'protein',
22 rettype => 'gp'});
23 # we're doing everything in callbacks, so initialize globals.
24 $BUFFER = '';
25 %GOT = ();
27 print header('text/plain');
29 param() or print_usage();
31 my $db = param('db');
32 my $style = param('style');
33 my $format = param('format');
34 my $id = param('id');
35 my @ids = split /\s+/,$id;
37 $format = 'genbank' if $format eq 'default'; #h'mmmph
39 $MAPPING{$db} or error(1=>"Unknown database [$db]");
40 $style eq 'raw' or error(2=>"Unknown style [$style]");
41 $format eq 'genbank' or error(3=>"Format [$format] not known for database [$db]");
43 $CACHE = Cache::FileCache->new({cache_root => CACHE_LOCATION,
44 default_expires_in => EXPIRATION,
45 cache_DEPTH => CACHE_DEPTH,
46 namespace => 'dbfetch',
47 auto_purge_interval => PURGE});
49 # handle cached entries
50 foreach (@ids) {
51 if (my $obj = $CACHE->get($_)) {
52 $GOT{$_}++;
53 print $obj,"//\n";
57 # handle the remainder
58 @ids = grep {!$GOT{$_}} @ids;
59 if (@ids) {
60 my $request = POST('https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
61 [rettype => $MAPPING{$db}{rettype},
62 db => $MAPPING{$db}{db},
63 tool => 'bioperl',
64 retmode => 'text',
65 usehistory => 'n',
66 id => join(',',@ids),
70 my $ua = LWP::UserAgent->new;
71 my $response = $ua->request($request,\&callback);
73 if ($response->is_error) {
74 my $status = $response->status_line;
75 error(6 => "HTTP error from GenBank [$status]");
79 my @missing_ids = grep {!$GOT{$_}} @ids;
80 foreach (@missing_ids) {
81 error(4=>"ID [$_] not found in database [$db]",1);
84 # my $response = $response->content;
86 sub process_record {
87 my $record = shift;
88 print "$record//\n";
89 my ($locus) = $record =~ /^LOCUS\s+(\S+)/m;
90 my ($accession) = $record =~ /^ACCESSION\s+(\S+)/m;
91 my ($version,$gi) = $record =~ /^VERSION\s+(\S+)\s+GI:(\d+)/m;
92 foreach ($locus,$accession,$version,$gi) {
93 $GOT{$_}++;
94 $CACHE->set($_,$record);
98 sub callback {
99 my $data = shift;
100 $BUFFER .= $data;
101 my $index = 0;
102 while (($index = index($BUFFER,"//\n\n",$index))>=0) {
103 my $record = substr($BUFFER,0,$index);
104 $index += length("//\n\n");
105 substr($BUFFER,0,$index) = '';
106 process_record($record);
112 sub print_usage {
113 print <<'END';
114 This script is intended to be used non-interactively.
116 Brief summary of arguments:
119 This interface does not specify what happens when biofetch is called
120 in interactive context. The implementations can return the entries
121 decorated with HTML tags and hypertext links.
123 A URL for biofetch consists of four sections:
125 e.g.
126 1. protocol http://
127 2. host www.ebi.ac.uk
128 3. path to program /Tools/dbfetch/dbfetch
129 4. query string ?style=raw;format=embl;db=embl;id=J00231
132 QUERY STRING
134 The query string options are separated from the base URL (protocol +
135 host + path) by a question mark (?) and from each other by a semicolon
136 ';' (or by ampersand '&'). See CGI GET documents at
137 http://www.w3.org/CGI/). The order of options is not critical. It is
138 recommended to leave the ID to be the last item.
140 Input for options should be case insensitive.
143 option: db
145 Option : db
146 Descr : database name
147 Type : required
148 Usage : db=genpep | db=genbank
149 Arg : string
151 Currently this server accepts "genbank" and "genpep"
153 option: style
155 Option : style
156 Descr : +/- HTML tags
157 Type : required
158 Usage : style=raw | db=html
159 Arg : enum (raw|html)
161 In non-interactive context, always give "style=raw". This uses
162 "Content-Type: text/plain". If other content types are needed (XML),
163 this part of the spesifications can be extended to accommodate them.
165 This server only accepts "raw".
168 option: format
170 Option : format
171 Descr : format of the database entries returned
172 Type : optional
173 Usage : format=genbank
174 Arg : enum
176 Format defaults to the distribution format of the database (embl for
177 EMBL database). If some other supported format is needed this option
178 is needed (E.g. formats for EMBL: fasta, bsml, agave).
180 This server only accepts "genbank" format.
182 option: id
184 Option : id
185 Descr : unique database identifier(s)
186 Type : required
187 Usage : db=J00231 | id=J00231+HSFOS
188 Arg : string
190 The ID option should be able to process all UIDS in a database. It
191 should not be necessary to know if the UID is an ID, accession number
192 or accession.version.
194 The number of entry UIDs allowed is implementation specific. If the
195 limit is exceeded, the the program reports an error. The UIDs should
196 be separated by spaces (use '+' in a GET method string).
199 ERROR MESSAGES
201 The following standardized one line messages should be printed out in
202 case of an error.
204 ERROR 1 Unknown database [$db].
205 ERROR 2 Unknown style [$style].
206 ERROR 3 Format [$format] not known for database [$db].
207 ERROR 4 ID [$id] not found in database [$db].
208 ERROR 5 Too many IDs [$count]. Max [$MAXIDS] allowed.
213 exit 0;
216 sub error {
217 my ($code,$message,$noexit) = @_;
218 print "ERROR $code $message\n";
219 exit 0 unless $noexit;
222 __END__
224 =head1 NAME
226 bp_biofetch_genbank_proxy.pl - Caching BioFetch-compatible web proxy for GenBank
228 =head1 SYNOPSIS
230 Install in cgi-bin directory of a Web server. Stand back.
232 =head1 DESCRIPTION
234 This CGI script acts as the server side of the BioFetch protocol as
235 described in http://obda.open-bio.org/Specs/. It provides two
236 database access services, one for data source "genbank" (nucleotide
237 entries) and the other for data source "genpep" (protein entries).
239 This script works by forwarding its requests to NCBI's eutils script,
240 which lives at https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi.
241 It then reformats the output according to the BioFetch format so the
242 sequences can be processed and returned by the Bio::DB::BioFetch
243 module. Returned entries are temporarily cached on the Web server's
244 file system, allowing frequently-accessed entries to be retrieved
245 without another round trip to NCBI.
247 =head2 INSTALLATION
249 You must have the following installed in order to run this script:
251 1) perl
252 2) the perl modules LWP and Cache::FileCache
253 3) a web server (Apache recommended)
255 To install this script, copy it into the web server's cgi-bin
256 directory. You might want to shorten its name; "dbfetch" is
257 recommended.
259 There are several constants located at the top of the script that you
260 may want to adjust. These are:
262 CACHE_LOCATION
264 This is the location on the filesystem where the cached files will be
265 located. The default is /usr/tmp/dbfetch_cache.
267 MAX_SIZE
269 This is the maximum size that the cache can grow to. When the cache
270 exceeds this size older entries will be deleted automatically. The
271 default setting is 100,000,000 bytes (100 MB).
273 EXPIRATION
275 Entries that haven't been accessed in this length of time will be
276 removed from the cache. The default is 1 week.
278 PURGE
280 This constant specifies how often the cache will be purged for older
281 entries. The default is 1 hour.
283 =head1 TESTING
285 To see if this script is performing as expected, you may test it with
286 this script:
288 use Bio::DB::BioFetch;
289 my $db = Bio::DB::BioFetch->new(-baseaddress=>'http://localhost/cgi-bin/dbfetch',
290 -format =>'genbank',
291 -db =>'genbank');
292 my $seq = $db->get_Seq_by_id('DDU63596');
293 print $seq->seq,"\n";
295 This should print out a DNA sequence.
297 =head1 SEE ALSO
299 L<Bio::DB::BioFetch>, L<Bio::DB::Registry>
301 =head1 AUTHOR
303 Lincoln Stein, E<lt>lstein-at-cshl.orgE<gt>
305 Copyright (c) 2003 Cold Spring Harbor Laboratory
307 This library is free software; you can redistribute it and/or modify
308 it under the same terms as Perl itself. See DISCLAIMER.txt for
309 disclaimers of warranty.
311 =cut