scripts/bpfetch.pl

   1 #!/usr/local/bin/perl
   2
   3
   4 =head1 NAME
   5
   6 bpfetch.pl - fetches sequences from bioperl indexed databases
   7
   8 =head1 SYNOPSIS
   9
  10   bpfetch.pl swiss:ROA1_HUMAN
  11
  12   bpfetch.pl net::genbank:X47072
  13
  14   bpfetch.pl net::genpept:ROA1_HUMAN
  15
  16   bpfetch.pl ace::myserver.somewhere.edu,21000:X56676
  17
  18   bpfetch.pl -fmt GCG swiss:ROA1_HUMAN
  19
  20 =head1 DESCRIPTION
  21
  22 Fetches sequences using the DB access systems in Bioperl. The most
  23 common use of this is to fetch sequences from bioperl indices built
  24 using bpindex.pl, or to fetch sequences from the NCBI website.
  25
  26 The format for retrieving sequences is deliberately like GCG/EMBOSS
  27 format, going
  28
  29   db:name
  30
  31 with the potential of putting in a 'meta' database type, being
  32
  33   meta::db:name
  34
  35 The meta information can be one of three types
  36
  37   local - local indexed flat file database
  38   net   - networked http: based database
  39   ace   - ACeDB database
  40
  41 This information defaults to 'local' for database names with no meta
  42 db information
  43
  44 =head1 OPTIONS
  45
  46   -fmt  <format> - Output format
  47                    Fasta (default), EMBL, Raw or GCG
  48   -acc           - string is an accession number, not an
  49                    id.
  50
  51 options only for expert use
  52
  53   -dir  <dir>    - directory to find the index files
  54                   (overrides BIOPERL_INDEX environment variable)
  55   -type <type>   - type of DBM file to open
  56                   (overrides BIOPERL_INDEX_TYPE environment variable)
  57
  58 =head1 ENVIRONMENT
  59
  60 bpindex and bpfetch coordinate where the databases lie using the
  61 environment variable BIOPERL_INDEX. This can be overridden using the
  62 -dir option. The index type (SDBM or DB_File or another index file)
  63 is controlled by the BIOPERL_INDEX_TYPE variable. This defaults to
  64 SDBM_File.
  65
  66 =head1 USING IT YOURSELF
  67
  68 bpfetch is a wrapper around the bioperl modules which support
  69 the Bio::DB::BioSeqI abstract interface. These include:
  70
  71   Author          Code
  72
  73   James Gilbert - Fasta indexer, Abstract indexer
  74   Aaron Mackay  - GenBank and GenPept DB access
  75   Ewan Birney   - EMBL .dat indexer
  76   Many people   - SeqIO code
  77
  78 These modules can be used directly, which is far better than using
  79 this script as a system call or a pipe to read from. Read the
  80 source code for bpfetch to see how it is used.
  81
  82 =head1 EXTENDING IT
  83
  84 bpfetch uses a number of different modules to provide access to
  85 databases. Any module which subscribes to the Bio::DB::BioSeqI
  86 interface can be used here. For flat file indexers, this is
  87 best done by extending Bio::Index::Abstract, as is done in
  88 Bio::Index::EMBL and Bio::Index::Fasta. For access to other
  89 databases you will need to roll your own interface.
  90
  91 For new output formats, you need to add a new SeqIO module. The
  92 easiest thing is to look at Bio::SeqIO::Fasta and figure out
  93 how to hack it for your own format (call it something different
  94 obviously).
  95
  96 =head1 FEEDBACK
  97
  98 =head2 Mailing Lists
  99
 100 User feedback is an integral part of the evolution of this and other
 101 Bioperl modules.  Send your comments and suggestions preferably to one
 102 of the Bioperl mailing lists.  Your participation is much appreciated.
 103
 104     vsns-bcd-perl@lists.uni-bielefeld.de          - General discussion
 105     vsns-bcd-perl-guts@lists.uni-bielefeld.de     - Technically-oriented discussion
 106     http://bio.perl.org/MailList.html             - About the mailing lists
 107
 108 =head2 Reporting Bugs
 109
 110 Report bugs to the Bioperl bug tracking system to help us keep track
 111 of the bugs and their resolution. Bug reports can be submitted via email
 112 or the web:
 113
 114     bioperl-bugs@bio.perl.org
 115     http://bio.perl.org/bioperl-bugs/
 116
 117 =head1 AUTHOR
 118
 119 Ewan Birney, birney@sanger.ac.uk
 120
 121 =cut
 122
 123 use strict;
 124 use Getopt::Long;
 125
 126 #
 127 # Catcher for people who are trying this script without installing
 128 # bioperl: load the modules now, falling back to the parent directory.
 129 # In your own script, you can just go
 130 # use Bio::Index::Fasta etc, rather than this
 131 #
 132
 133 BEGIN {
 134     # Returns 1 when every module loads; otherwise undef, with the
 135     # reason left in $@.
 136     my $load_modules = sub {
 137         return eval {
 138             require Bio::Index::Fasta;
 139             require Bio::Index::EMBL;
 140             require Bio::DB::GenBank;
 141             require Bio::DB::GenPept;
 142             require Bio::SeqIO;
 143             1;
 144         };
 145     };
 146
 147     if ( !$load_modules->() ) {
 148         # one up from here is Bio directory - we hope!
 149         push(@INC,"..");
 150
 151         if ( !$load_modules->() ) {
 152             # Still not loadable: report the underlying error and give up.
 153             print STDERR ("\nbpfetch cannot load the bioperl modules it needs\nbpfetch needs to have bioperl installed for it to run.\nBioperl is very easy to install\nSee http://bio.perl.org for more information\n\nThe error was: $@\n");
 154             exit(1);
 155         }
 156
 157         print STDERR ("\nYou are running bpfetch.pl without installing bioperl.\nYou have done it from bioperl/scripts, and so we can find the necessary information\nbut it is much better to install bioperl\n\nPlease read the README in the bioperl distribution\n\n");
 158     }
 159 }
 160
 161 #
 162 # Start processing the command line. -dir and -type override the
 163 # BIOPERL_INDEX and BIOPERL_INDEX_TYPE environment variables.
 164 #
 165 my $dir = $ENV{'BIOPERL_INDEX'};
 166 my $type = $ENV{'BIOPERL_INDEX_TYPE'};
 167 my $fmt = 'Fasta';
 168 my $useacc = 0;
 169 my $ret = GetOptions('dir=s' => \$dir,'fmt=s' => \$fmt , 'type=s' => \$type , 'acc!' => \$useacc);
 170
 171 # GetOptions returns false after warning about a bad option: stop here
 172 exit(1) unless $ret;
 173
 174 # print pod documentation if we have no arguments
 175 exec('perldoc',$0) unless @ARGV;
 176
 177 # Working variables shared by the main loop below
 178 my($isnet,$db,$dbobj,$id,$seq,$seqio,$out,$meta);
 179
 180 #
 181 # Override the DBM type variable of Bio::Index::Abstract, but
 182 # only when a type was supplied
 183 #
 184
 185 $Bio::Index::Abstract::USE_DBM_TYPE = $type if $type;
 186
 187 #
 188 # Create the SeqIO writer now that the format is known; it
 189 # writes to STDOUT
 190 #
 191 $out = Bio::SeqIO->new('-fh', \*STDOUT, '-format', $fmt);
 192
 193 #
 194 # Main loop over remaining arguments, each of the form [meta::]db:name.
 195 # A problem with one argument is reported on STDERR and that argument
 196 # is skipped, so the arguments after it are still fetched.
 197 #
 198
 199 foreach my $arg ( @ARGV ) {
 200     my $spec = $arg;
 201
 202     # strip out meta:: if there, otherwise this is a local index
 203     $meta = ( $spec =~ s/^(\w+)::// ) ? $1 : 'local';
 204
 205     # parse to db:id
 206     ($db,$id) = $spec =~ /^(\S+)\:(\S+)$/;
 207     if( !defined $id ) {
 208         print STDERR "$spec is not parsed as db:name\n";
 209         next;
 210     }
 211
 212     #
 213     # the eval block catches exceptions if they occur
 214     # in the code in the block. The exception goes in $@
 215     #
 216
 217     $dbobj = undef;
 218     eval {
 219         if( $meta eq 'net' ) {
 220             if( $db =~ /genbank/ ) {
 221                 $dbobj = Bio::DB::GenBank->new();
 222             }
 223             elsif( $db =~ /genpept/ ) {
 224                 $dbobj = Bio::DB::GenPept->new();
 225             } else {
 226                 die "Net database $db not available";
 227             }
 228         }
 229         elsif( $meta eq 'ace' ) {
 230             # yank in Bio::DB::Ace at runtime
 231             eval {
 232                 require Bio::DB::Ace;
 233             };
 234             if ( $@ ) {
 235                 die "Unable to load Bio::DB::Ace for ace::$db\n\n$@\n";
 236             }
 237
 238             # db is server,port; the captures are copied straight away
 239             # so that no later regex match can overwrite them
 240             $db =~ /(\S+)\,(\d+)/ || die "$db is not server.name,port for acedb database";
 241             my ($server,$port) = ($1,$2);
 242
 243             $dbobj = Bio::DB::Ace->new(-host => $server, -port => $port);
 244         }
 245         elsif( $meta eq 'local' ) {
 246             if( !$dir ) {
 247                 die "\nNo directory specified for index\nDirectory must be specified by the environment variable BIOPERL_INDEX or --dir option\ngo bpfetch with no arguments for more help\n\n";
 248             }
 249
 250             #
 251             # $dbobj gets re-blessed to the correct index when
 252             # it is made from the abstract class. Cute eh?
 253             #
 254
 255             $dbobj = Bio::Index::Abstract->new("$dir/$db");
 256         }
 257         else {
 258             die "Meta database $meta is not valid";
 259         }
 260
 261         # a constructor that hands back nothing counts as a failure, so
 262         # the method calls below are never made on an undefined value
 263         die "No database object was created\n" unless defined $dbobj;
 264     }; # end of eval to get db
 265     if( $@ ) {
 266         warn("Database $db in $arg is not loadable. Skipping\n\nError $@");
 267         next;
 268     }
 269
 270     #
 271     # We expect the databases to adhere to the BioSeqI
 272     # the sequence index databases and the GenBank/GenPept do already
 273     #
 274
 275     if( ! $dbobj->isa('Bio::DB::BioSeqI') ) {
 276         warn("$db in $arg does not inherit from Bio::DB::BioSeqI, so is not expected to work under the DB guidlines. Going to try it anyway");
 277     }
 278
 279     #
 280     # Fetch by accession number when -acc was given, by id otherwise.
 281     # A fetch that returns nothing is reported like one that dies.
 282     #
 283
 284     $seq = undef;
 285     eval {
 286         if( $useacc ) {
 287             $seq = $dbobj->get_Seq_by_acc($id);
 288         } else {
 289             $seq = $dbobj->get_Seq_by_id($id);
 290         }
 291         die "No sequence was returned\n" unless defined $seq;
 292     };
 293     if( $@ ) {
 294         warn("Sequence $id in Database $db in $arg is not loadable. Skipping.\n\nError $@");
 295         next;
 296     }
 297
 298     $out->write_seq($seq);
 299 }
 300
 301
 302
 303
 304
 305
 306