scripts/DB/bp_biofetch_genbank_proxy.pl

   1 #!perl
   2
   3 # dbfetch style caching proxy for GenBank
   4 use strict;
   5 use warnings;
   6 use CGI qw(:standard);
   7 use HTTP::Request::Common;
   8 use LWP::UserAgent;
   9 use Cache::FileCache;
  10
  11 use vars qw(%GOT $BUFFER %MAPPING $CACHE);
  12
  13 use constant CACHE_LOCATION => '/usr/tmp/dbfetch_cache';
  14 use constant MAX_SIZE   => 100_000_000;  # 100 megs, roughly
  15 use constant CACHE_DEPTH => 4;
  16 use constant EXPIRATION => "1 week";
  17 use constant PURGE      => "1 hour";
  18
  19 %MAPPING = (genbank => {db=>'nucleotide',
  20                         rettype => 'gb'},
  21             genpep  => {db=>'protein',
  22                         rettype => 'gp'});
  23 # we're doing everything in callbacks, so initialize globals.
  24 $BUFFER = '';
  25 %GOT    = ();
  26
  27 print header('text/plain');
  28
  29 param() or print_usage();
  30
  31 my $db     = param('db');
  32 my $style  = param('style');
  33 my $format = param('format');
  34 my $id     = param('id');
  35 my @ids    = split /\s+/,$id;
  36
  37 $format = 'genbank' if $format eq 'default';  #h'mmmph
  38
  39 $MAPPING{$db}        or error(1=>"Unknown database [$db]");
  40 $style  eq 'raw'     or error(2=>"Unknown style [$style]");
  41 $format eq 'genbank' or error(3=>"Format [$format] not known for database [$db]");
  42
  43 $CACHE = Cache::FileCache->new({cache_root          => CACHE_LOCATION,
  44                                 default_expires_in  => EXPIRATION,
  45                                 cache_DEPTH         => CACHE_DEPTH,
  46                                 namespace           => 'dbfetch',
  47                                 auto_purge_interval => PURGE});
  48
  49 # handle cached entries
  50 foreach (@ids) {
  51   if (my $obj = $CACHE->get($_)) {
  52     $GOT{$_}++;
  53     print $obj,"//\n";
  54   }
  55 }
  56
  57 # handle the remainder
  58 @ids = grep {!$GOT{$_}} @ids;
  59 if (@ids) {
  60   my $request = POST('https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
  61                      [rettype    => $MAPPING{$db}{rettype},
  62                       db         => $MAPPING{$db}{db},
  63                       tool       => 'bioperl',
  64                       retmode    => 'text',
  65                       usehistory => 'n',
  66                       id         => join(',',@ids),
  67                      ]
  68                     );
  69
  70   my $ua = LWP::UserAgent->new;
  71   my $response = $ua->request($request,\&callback);
  72
  73   if ($response->is_error) {
  74     my $status = $response->status_line;
  75     error(6 => "HTTP error from GenBank [$status]");
  76   }
  77 }
  78
  79 my @missing_ids = grep {!$GOT{$_}} @ids;
  80 foreach (@missing_ids) {
  81   error(4=>"ID [$_] not found in database [$db]",1);
  82 }
  83
  84 # my $response = $response->content;
  85
  86 sub process_record {
  87   my $record = shift;
  88   print "$record//\n";
  89   my ($locus)       = $record =~ /^LOCUS\s+(\S+)/m;
  90   my ($accession)   = $record =~ /^ACCESSION\s+(\S+)/m;
  91   my ($version,$gi) = $record =~ /^VERSION\s+(\S+)\s+GI:(\d+)/m;
  92   foreach ($locus,$accession,$version,$gi) {
  93     $GOT{$_}++;
  94     $CACHE->set($_,$record);
  95   }
  96 }
  97
  98 sub callback {
  99   my $data = shift;
 100   $BUFFER .= $data;
 101   my $index = 0;
 102   while (($index = index($BUFFER,"//\n\n",$index))>=0) {
 103     my $record = substr($BUFFER,0,$index);
 104     $index += length("//\n\n");
 105     substr($BUFFER,0,$index) = '';
 106     process_record($record);
 107   }
 108 }
 109
 110
 111
 112 sub print_usage {
 113   print <<'END';
 114 This script is intended to be used non-interactively.
 115
 116 Brief summary of arguments:
 117 URL
 118
 119 This interface does not specify what happens when biofetch is called
 120 in interactive context. The implementations can return the entries
 121 decorated with HTML tags and hypertext links.
 122
 123 A URL for biofetch consists of four sections:
 124
 125                         e.g.
 126 1. protocol             http://
 127 2. host                 www.ebi.ac.uk
 128 3. path to program      /Tools/dbfetch/dbfetch
 129 4. query string         ?style=raw;format=embl;db=embl;id=J00231
 130
 131
 132 QUERY STRING
 133
 134 The query string options are separated from the base URL (protocol +
 135 host + path) by a question mark (?) and from each other by a semicolon
 136 ';' (or by ampersand '&'). See CGI GET documents at
 137 http://www.w3.org/CGI/). The order of options is not critical. It is
 138 recommended to leave the ID to be the last item.
 139
 140 Input for options should be case insensitive.
 141
 142
 143 option: db
 144
 145   Option  : db
 146   Descr   : database name
 147   Type    : required
 148   Usage   : db=genpep | db=genbank
 149   Arg     : string
 150
 151 Currently this server accepts "genbank" and "genpep"
 152
 153 option: style
 154
 155   Option  : style
 156   Descr   : +/- HTML tags
 157   Type    : required
 158   Usage   : style=raw | db=html
 159   Arg     : enum (raw|html)
 160
 161 In non-interactive context, always give "style=raw". This uses
 162 "Content-Type: text/plain". If other content types are needed (XML),
 163 this part of the spesifications can be extended to accommodate them.
 164
 165 This server only accepts "raw".
 166
 167
 168 option: format
 169
 170   Option  : format
 171   Descr   : format of the database entries returned
 172   Type    : optional
 173   Usage   : format=genbank
 174   Arg     : enum
 175
 176 Format defaults to the distribution format of the database (embl for
 177 EMBL database). If some other supported format is needed this option
 178 is needed (E.g. formats for EMBL: fasta, bsml, agave).
 179
 180 This server only accepts "genbank" format.
 181
 182 option: id
 183
 184   Option  : id
 185   Descr   : unique database identifier(s)
 186   Type    : required
 187   Usage   : db=J00231 | id=J00231+HSFOS
 188   Arg     : string
 189
 190 The ID option should be able to process all UIDS in a database. It
 191 should not be necessary to know if the UID is an ID, accession number
 192 or accession.version.
 193
 194 The number of entry UIDs allowed is implementation specific. If the
 195 limit is exceeded, the the program reports an error. The UIDs should
 196 be separated by spaces (use '+' in a GET method string).
 197
 198
 199 ERROR MESSAGES
 200
 201 The following standardized one line messages should be printed out in
 202 case of an error.
 203
 204 ERROR 1 Unknown database [$db].
 205 ERROR 2 Unknown style [$style].
 206 ERROR 3 Format [$format] not known for database [$db].
 207 ERROR 4 ID [$id] not found in database [$db].
 208 ERROR 5 Too many IDs [$count]. Max [$MAXIDS] allowed.
 209
 210 END
 211 ;
 212
 213 exit 0;
 214 }
 215
 216 sub error {
 217   my ($code,$message,$noexit) = @_;
 218   print "ERROR $code $message\n";
 219   exit 0 unless $noexit;
 220 }
 221
 222 __END__
 223
 224 =head1 NAME
 225
 226 bp_biofetch_genbank_proxy.pl - Caching BioFetch-compatible web proxy for GenBank
 227
 228 =head1 SYNOPSIS
 229
 230   Install in cgi-bin directory of a Web server.  Stand back.
 231
 232 =head1 DESCRIPTION
 233
 234 This CGI script acts as the server side of the BioFetch protocol as
 235 described in http://obda.open-bio.org/Specs/.  It provides two
 236 database access services, one for data source "genbank" (nucleotide
 237 entries) and the other for data source "genpep" (protein entries).
 238
 239 This script works by forwarding its requests to NCBI's eutils script,
 240 which lives at https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi.
 241 It then reformats the output according to the BioFetch format so the
 242 sequences can be processed and returned by the Bio::DB::BioFetch
 243 module.  Returned entries are temporarily cached on the Web server's
 244 file system, allowing frequently-accessed entries to be retrieved
 245 without another round trip to NCBI.
 246
 247 =head2 INSTALLATION
 248
 249 You must have the following installed in order to run this script:
 250
 251    1) perl
 252    2) the perl modules LWP and Cache::FileCache
 253    3) a web server (Apache recommended)
 254
 255 To install this script, copy it into the web server's cgi-bin
 256 directory.  You might want to shorten its name; "dbfetch" is
 257 recommended.
 258
 259 There are several constants located at the top of the script that you
 260 may want to adjust.  These are:
 261
 262 CACHE_LOCATION
 263
 264 This is the location on the filesystem where the cached files will be
 265 located.  The default is /usr/tmp/dbfetch_cache.
 266
 267 MAX_SIZE
 268
 269 This is the maximum size that the cache can grow to.  When the cache
 270 exceeds this size older entries will be deleted automatically.  The
 271 default setting is 100,000,000 bytes (100 MB).
 272
 273 EXPIRATION
 274
 275 Entries that haven't been accessed in this length of time will be
 276 removed from the cache.  The default is 1 week.
 277
 278 PURGE
 279
 280 This constant specifies how often the cache will be purged for older
 281 entries.  The default is 1 hour.
 282
 283 =head1 TESTING
 284
 285 To see if this script is performing as expected, you may test it with
 286 this script:
 287
 288  use Bio::DB::BioFetch;
 289  my $db = Bio::DB::BioFetch->new(-baseaddress=>'http://localhost/cgi-bin/dbfetch',
 290                                  -format     =>'genbank',
 291                                  -db         =>'genbank');
 292  my $seq = $db->get_Seq_by_id('DDU63596');
 293  print $seq->seq,"\n";
 294
 295 This should print out a DNA sequence.
 296
 297 =head1 SEE ALSO
 298
 299 L<Bio::DB::BioFetch>, L<Bio::DB::Registry>
 300
 301 =head1 AUTHOR
 302
 303 Lincoln Stein, E<lt>lstein-at-cshl.orgE<gt>
 304
 305 Copyright (c) 2003 Cold Spring Harbor Laboratory
 306
 307 This library is free software; you can redistribute it and/or modify
 308 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 309 disclaimers of warranty.
 310
 311 =cut
 312