added bio::ext support
[bioperl-live.git] / scripts / bpfetch.pl
blob38faf3742c9e0da1803c06b6aef80a219728c507
1 #!/usr/local/bin/perl
4 =head1 NAME
6 bpfetch.pl - fetches sequences from bioperl indexed databases
8 =head1 SYNOPSIS
10 bpfetch.pl swiss:ROA1_HUMAN
12 bpfetch.pl net::genbank:X47072
14 bpfetch.pl net::genpept:ROA1_HUMAN
16 bpfetch.pl ace::myserver.somewhere.edu,21000:X56676
18 bpfetch.pl -fmt GCG swiss:ROA1_HUMAN
20 =head1 DESCRIPTION
22 Fetches sequences using the DB access systems in Bioperl. The most
23 common use of this is to fetch sequences from bioperl indices built
24 using bpindex.pl, or to fetch sequences from the NCBI website.
26 The format for retrieving sequences is deliberately like GCG/EMBOSS
27 format, going
29 db:name
31 with the potential of putting in a 'meta' database type, being
33 meta::db:name
35 The meta information can be one of three types
37 local - local indexed flat file database
38 net - networked http: based database
39 ace - ACeDB database
41 This information defaults to 'local' for database names with no meta
42 db information
44 =head1 OPTIONS
46 -fmt <format> - Output format
47 Fasta (default), EMBL, Raw or GCG
48 -acc - string is an accession number, not an
49 id.
51 options only for expert use
53 -dir <dir> - directory to find the index files
54 (overrides BIOPERL_INDEX environment variable)
55 -type <type> - type of DBM file to open
56 (overrides BIOPERL_INDEX_TYPE environment variable)
58 =head1 ENVIRONMENT
60 bpindex and bpfetch coordinate where the databases lie using the
61 environment variable BIOPERL_INDEX. This can be overridden using the
62 -dir option. The index type (SDBM or DB_File or another index file)
63 is controlled by the BIOPERL_INDEX_TYPE variable. This defaults to
64 SDBM_File
66 =head1 USING IT YOURSELF
68 bpfetch is a wrapper around the bioperl modules which support
69 the Bio::DB::BioSeqI abstract interface. These include:
71 Author Code
73 James Gilbert - Fasta indexer, Abstract indexer
74 Aaron Mackay - GenBank and GenPept DB access
75 Ewan Birney - EMBL .dat indexer
76 Many people - SeqIO code
78 These modules can be used directly, which is far better than using
79 this script as a system call or a pipe to read from. Read the
80 source code for bpfetch to see how it is used.
82 =head1 EXTENDING IT
84 bpfetch uses a number of different modules to provide access to
85 databases. Any module which subscribes to the Bio::DB::BioSeqI
86 interface can be used here. For flat file indexers, this is
87 best done by extending Bio::Index::Abstract, as is done in
88 Bio::Index::EMBL and Bio::Index::Fasta. For access to other
89 databases you will need to roll your own interface.
91 For new output formats, you need to add a new SeqIO module. The
92 easiest thing is to look at Bio::SeqIO::Fasta and figure out
93 how to hack it for your own format (call it something different
94 obviously).
96 =head1 FEEDBACK
98 =head2 Mailing Lists
100 User feedback is an integral part of the evolution of this and other
101 Bioperl modules. Send your comments and suggestions preferably to one
102 of the Bioperl mailing lists. Your participation is much appreciated.
104 vsns-bcd-perl@lists.uni-bielefeld.de - General discussion
105 vsns-bcd-perl-guts@lists.uni-bielefeld.de - Technically-oriented discussion
106 http://bio.perl.org/MailList.html - About the mailing lists
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 of the bugs and their resolution. Bug reports can be submitted via email
112 or the web:
114 bioperl-bugs@bio.perl.org
115 http://bio.perl.org/bioperl-bugs/
117 =head1 AUTHOR
119 Ewan Birney, birney@sanger.ac.uk
121 =cut
123 use strict;
124 use Getopt::Long;
# Safety net for people who try this script without installing
# bioperl. In your own script you can simply write
#
#     use Bio::Index::Fasta;    # etc.
#
# rather than loading the modules by hand as is done here.

BEGIN {
    # Every module the script needs up front. Bio::DB::Ace is not listed
    # because it is only pulled in on demand for ace:: requests.
    my @needed = qw(
        Bio::Index::Fasta
        Bio::Index::EMBL
        Bio::DB::GenBank
        Bio::DB::GenPept
        Bio::SeqIO
    );

    # Loads all of @needed, dying on the first module that cannot be
    # found. Returns true so the caller can test the eval's own return
    # value rather than relying on $@.
    my $load_all = sub {
        for my $module (@needed) {
            ( my $file = "$module.pm" ) =~ s{::}{/}g;
            require $file;
        }
        return 1;
    };

    if ( !eval { $load_all->() } ) {
        # one up from here is Bio directory - we hope!
        push( @INC, ".." );

        if ( !eval { $load_all->() } ) {
            my $module_list = join( ", ", @needed );
            print STDERR ("\nbpfetch cannot find the bioperl modules it needs ($module_list)\nbpfetch needs to have bioperl installed for it to run.\nBioperl is very easy to install\nSee http://bio.perl.org for more information\n\n");
            exit(1);
        }
        else {
            print STDERR ("\nYou are running bpfetch.pl without installing bioperl.\nYou have done it from bioperl/scripts, and so we can find the necessary information\nbut it is much better to install bioperl\n\nPlease read the README in the bioperl distribution\n\n");
        }
    }
}
# Start processing the command line

# Defaults come from the environment; -dir and -type override them.
# The index type variable is BIOPERL_INDEX_TYPE, as documented in the
# POD above.
my $dir    = $ENV{'BIOPERL_INDEX'};
my $type   = $ENV{'BIOPERL_INDEX_TYPE'};
my $fmt    = 'Fasta';
my $useacc = 0;
my $ret    = GetOptions(
    'dir=s'  => \$dir,
    'fmt=s'  => \$fmt,
    'type=s' => \$type,
    'acc!'   => \$useacc,
);

# GetOptions has already reported the offending option on STDERR; stop
# here rather than carrying on with settings the user did not intend.
die "bpfetch.pl: invalid command line options; run with no arguments for help\n"
    unless $ret;

# print pod documentation if we have no arguments

exec( 'perldoc', $0 ) unless @ARGV;

# Working variables shared with the main loop below.
my($isnet,$db,$dbobj,$id,$seq,$seqio,$out,$meta);

# Reset the type if needed

if ( $type ) {
    $Bio::Index::Abstract::USE_DBM_TYPE = $type;
}

# Build at run time the SeqIO output

$out = Bio::SeqIO->new( -fh => \*STDOUT, -format => $fmt );
# Main loop over remaining arguments
#
# Each argument has the form [meta::]db:name. For every argument we
# open the database named by meta/db, fetch the sequence by id (or by
# accession when -acc was given) and write it to the output stream.
# A failure on one argument is reported on STDERR and the loop moves on
# to the next one.

foreach my $arg ( @ARGV ) {
    # Work on a named copy instead of clobbering the global $_.
    my $spec = $arg;

    # strip out meta:: if there; without it the database is a local index
    if ( $spec =~ s/^(\w+)::// ) {
        $meta = $1;
    }
    else {
        $meta = 'local';
    }

    # parse to db:id
    my @parts = $spec =~ /^(\S+)\:(\S+)$/;
    if ( !@parts ) {
        print STDERR "$spec is not parsed as db:name\n";
        next;
    }
    ( $db, $id ) = @parts;

    # the eval block catches exceptions if they occur
    # in the code in the block. The exception goes in $@

    eval {
        if ( $meta eq 'net' ) {
            if ( $db =~ /genbank/ ) {
                $dbobj = Bio::DB::GenBank->new();
            }
            elsif ( $db =~ /genpept/ ) {
                $dbobj = Bio::DB::GenPept->new();
            }
            else {
                die "Net database $db not available";
            }
        }
        elsif ( $meta eq 'ace' ) {
            # yank in Bio::DB::Ace at runtime
            eval {
                require Bio::DB::Ace;
            };
            if ( $@ ) {
                die "Unable to load Bio::DB::Ace for ace::$db\n\n$@\n";
            }

            # db is server,port
            my ( $server, $port ) = $db =~ /(\S+)\,(\d+)/;
            defined $port
                or die "$db is not server.name,port for acedb database";
            # print STDERR "Connecting to $server,$port\n";

            $dbobj = Bio::DB::Ace->new( -host => $server, -port => $port );
        }
        elsif ( $meta eq 'local' ) {
            if ( !$dir ) {
                die "\nNo directory specified for index\nDirectory must be specified by the environment varaible BIOPERL_INDEX or --dir option\ngo bpindex with no arguments for more help\n\n";
            }

            # $db gets re-blessed to the correct index when
            # it is made from the abstract class. Cute eh?

            $dbobj = Bio::Index::Abstract->new("$dir/$db");
        }
        else {
            die "Meta database $meta is not valid";
        }
    }; # end of eval to get db
    if ( $@ ) {
        warn("Database $db in $arg is not loadable. Skipping\n\nError $@");
        next;
    }

    # We expect the databases to adhere to the BioSeqI
    # the sequence index databases and the GenBank/GenPept do already

    if ( !$dbobj->isa('Bio::DB::BioSeqI') ) {
        warn("$db in $arg does not inherit from Bio::DB::BioSeqI, so is not expected to work under the DB guidlines. Going to try it anyway");
    }

    eval {
        $seq = $useacc == 0
             ? $dbobj->get_Seq_by_id($id)
             : $dbobj->get_Seq_by_acc($id);
    };
    if ( $@ ) {
        warn("Sequence $id in Database $db in $arg is not loadable. Skipping.\n\nError $@");
        next;
    }

    # NOTE(review): defensive guard in case a lookup returns nothing
    # instead of throwing - confirm against the Bio::DB implementations.
    # Without it an undefined sequence would reach write_seq outside
    # any eval.
    if ( !defined $seq ) {
        warn("Sequence $id in Database $db in $arg was not found. Skipping.\n");
        next;
    }

    $out->write_seq($seq);
}